1 Old 29 Training sets

1.1 Boxplot Top N: None vs Elbow vs Median

# Load the pCRM evaluation tables for the three top-N cutoff strategies
# (no cutoff, median of the amplitude curve, elbow of the amplitude curve).
# top_none contains rows for several methods, so it is narrowed to the IMM rows
# below; the median/elbow files are IMM-only judging by their file names.
top_none <- read.table(
  "~/Box/SCRMshaw_HD_all29Sets_allMethods_Jan2020/pCRMeval/top_noneAmplCurve/output_top_none_oldAllSetsAllMethods.bed",
  header = TRUE
)
top_median_imm <- read.table(
  "~/Box/SCRMshaw_HD_all29Sets_allMethods_Jan2020/pCRMeval/medianAmplitudeCurve/output_topMedianAmp_allold29sets_IMM.bed",
  header = TRUE
)
top_elbow_imm <- read.table(
  "~/Box/SCRMshaw_HD_all29Sets_allMethods_Jan2020/pCRMeval/elbowAmplitudeCurve/output_topAll29_elbow_IMM.bed",
  header = TRUE
)
top_none_imm <- subset(top_none, Method == "imm")

# One box per (measure, cutoff) pair: TS = training-set sensitivity,
# RR = REDfly recovery, EP = expression-pattern precision. Values are stored as
# fractions and scaled to percent here. The title previously said "new74sets"
# although every input file is from the old 29 training sets.
boxplot(
  top_none_imm$PercentageTrainingSetSensitivity * 100,
  top_median_imm$PercentageTrainingSetSensitivity * 100,
  top_elbow_imm$PercentageTrainingSetSensitivity * 100,
  top_none_imm$PercentageRedflyRecovered * 100,
  top_median_imm$PercentageRedflyRecovered * 100,
  top_elbow_imm$PercentageRedflyRecovered * 100,
  top_none_imm$percentageExpressionPatternPrecision * 100,
  top_median_imm$percentageExpressionPatternPrecision * 100,
  top_elbow_imm$percentageExpressionPatternPrecision * 100,
  names = c(
    "TS_None", "TS_Med", "TS_Elb",
    "RR_None", "RR_Med", "RR_Elb",
    "EP_None", "EP_Med", "EP_Elb"
  ),
  main = "Percentage_comparison_3methods_allmeasures_old29sets",
  boxwex = 0.6,
  # orange = None, red = Median, blue = Elbow, repeated for each measure
  col = rep(c("orange", "red", "blue"), times = 3),
  ylim = c(0, 100)
)

1.2 Dotplots

1.2.0.1 Training set Sensitivity

library(ggplot2)
# Training-set sensitivity (percent) against number of SCRMs, one panel per
# training set, with one point layer per cutoff strategy.
# The colour vector is named so each strategy keeps the colour it has in the
# boxplot (None = orange, Median = red, Elbow = blue). An unnamed vector is
# assigned in alphabetical order of the labels, which swapped None and Elbow.
ggplot(top_none_imm) +
  facet_wrap(~TsetName) +
  geom_point(
    data = top_none_imm,
    aes(x = SCRMs, y = PercentageTrainingSetSensitivity * 100, colour = "None"),
    shape = 8
  ) +
  geom_point(
    data = top_median_imm,
    aes(x = SCRMs, y = PercentageTrainingSetSensitivity * 100, colour = "Median"),
    shape = 8
  ) +
  geom_point(
    data = top_elbow_imm,
    aes(x = SCRMs, y = PercentageTrainingSetSensitivity * 100, colour = "Elbow"),
    shape = 8
  ) +
  scale_colour_manual(
    values = c(None = "orange", Median = "red", Elbow = "blue")
  ) +
  ylim(0, 100) +
  theme_bw()

1.2.0.2 Redfly Recovery

# REDfly recovery (percent) against number of SCRMs, one panel per training
# set, with one point layer per cutoff strategy.
# The colour vector is named so each strategy keeps the colour it has in the
# boxplot (None = orange, Median = red, Elbow = blue). An unnamed vector is
# assigned in alphabetical order of the labels, which swapped None and Elbow.
ggplot(top_none_imm) +
  facet_wrap(~TsetName) +
  geom_point(
    data = top_none_imm,
    aes(x = SCRMs, y = PercentageRedflyRecovered * 100, colour = "None"),
    shape = 8
  ) +
  geom_point(
    data = top_median_imm,
    aes(x = SCRMs, y = PercentageRedflyRecovered * 100, colour = "Median"),
    shape = 8
  ) +
  geom_point(
    data = top_elbow_imm,
    aes(x = SCRMs, y = PercentageRedflyRecovered * 100, colour = "Elbow"),
    shape = 8
  ) +
  scale_colour_manual(
    values = c(None = "orange", Median = "red", Elbow = "blue")
  ) +
  ylim(0, 100) +
  theme_bw()
## Warning: Removed 1 rows containing missing values (geom_point).

1.2.0.3 Expression Pattern Precision

# Expression-pattern precision (percent) against number of SCRMs, one panel
# per training set, with one point layer per cutoff strategy.
# The colour vector is named so each strategy keeps the colour it has in the
# boxplot (None = orange, Median = red, Elbow = blue). An unnamed vector is
# assigned in alphabetical order of the labels, which swapped None and Elbow.
ggplot(top_none_imm) +
  facet_wrap(~TsetName) +
  geom_point(
    data = top_none_imm,
    aes(x = SCRMs, y = percentageExpressionPatternPrecision * 100, colour = "None"),
    shape = 8
  ) +
  geom_point(
    data = top_median_imm,
    aes(x = SCRMs, y = percentageExpressionPatternPrecision * 100, colour = "Median"),
    shape = 8
  ) +
  geom_point(
    data = top_elbow_imm,
    aes(x = SCRMs, y = percentageExpressionPatternPrecision * 100, colour = "Elbow"),
    shape = 8
  ) +
  ylim(0, 100) +
  scale_colour_manual(
    values = c(None = "orange", Median = "red", Elbow = "blue")
  ) +
  theme_bw()

1.2.0.4 Expression Pattern Recall

# Expression-pattern recall (percent) against number of SCRMs, one panel per
# training set, with one point layer per cutoff strategy.
# All three layers plot the same measure, percentageExpressionPatternRecall.
# The Median and Elbow layers previously plotted
# ExpectedpercentageExpressionPatternRecall, so the panels compared actual
# recall for "None" against expected recall for the other two.
# NOTE(review): if an actual-vs-expected comparison was intended, plot the
# Expected* column as separate, separately labelled layers instead.
# The colour vector is named so each strategy keeps the colour it has in the
# boxplot (None = orange, Median = red, Elbow = blue).
ggplot(top_none_imm) +
  facet_wrap(~TsetName) +
  geom_point(
    data = top_none_imm,
    aes(x = SCRMs, y = percentageExpressionPatternRecall * 100, colour = "None"),
    shape = 8
  ) +
  geom_point(
    data = top_median_imm,
    aes(x = SCRMs, y = percentageExpressionPatternRecall * 100, colour = "Median"),
    shape = 8
  ) +
  geom_point(
    data = top_elbow_imm,
    aes(x = SCRMs, y = percentageExpressionPatternRecall * 100, colour = "Elbow"),
    shape = 8
  ) +
  ylim(0, 100) +
  scale_colour_manual(
    values = c(None = "orange", Median = "red", Elbow = "blue")
  ) +
  theme_bw()

1.3 Old Training set classification

1.3.1 Top Median of Amplitude curve

# Baseline results obtained from random training sets.
top_fake <- read.table(
  "/Users/hasibaasma/Box/complete_data_representation_newMethod/files/del_randomAvg62sets_28times_Excel_withSpec.bed",
  header = TRUE
)

# Actual IMM results at the median-of-amplitude-curve cutoff.
top_orig_oldSets_Med <- top_median_imm

# Sort both tables by training-set name; the columns below are paired purely
# by row position, so row i of s1 must describe the same set as row i of s2.
s1 <- top_orig_oldSets_Med[order(top_orig_oldSets_Med$TsetName), ]
s2 <- top_fake[order(top_fake$TsetName), ]
if (!identical(as.character(s1$TsetName), as.character(s2$TsetName))) {
  warning(
    "TsetName of the actual and random tables do not match row-for-row; ",
    "the actual-minus-random differences may pair unrelated training sets",
    call. = FALSE
  )
}

# Columns 1-8: actual results; 9: actual expression-pattern recall minus
# random specificity; 10-14: random results; 15-16: actual minus random
# training-set sensitivity and REDfly recovery.
# NOTE(review): column 9 subtracts s2$PercentageSpecificity from
# s1$percentageExpressionPatternRecall - confirm these are comparable measures.
subsetIMMevaluationOutputContRand1000_Actual1000_diff <- cbind.data.frame(
  s1$TsetName,
  s1$Method,
  s1$TsetSize,
  s1$SCRMs,
  s1$TrainingSetRecovered,
  s1$PercentageTrainingSetSensitivity,
  s1$REDflyRecovered,
  s1$PercentageRedflyRecovered,
  s1$percentageExpressionPatternRecall - s2$PercentageSpecificity,
  s2$SCRMs,
  s2$TrainingSetRecovered,
  s2$PercentageTrainingSetSensitivity,
  s2$REDflyRecovered,
  s2$PercentageRedflyRecovered,
  s1$PercentageTrainingSetSensitivity - s2$PercentageTrainingSetSensitivity,
  s1$PercentageRedflyRecovered - s2$PercentageRedflyRecovered
)

# Give readable names to the columns used downstream; the remaining columns
# keep the names cbind.data.frame derived from the expressions above.
colnames(subsetIMMevaluationOutputContRand1000_Actual1000_diff)[c(1, 4, 9, 15, 16)] <- c(
  "TsetName",
  "SCRMs",
  "DifferenceBetweenActualRandomPercentageSpecificity",
  "DifferenceBetweenActualRandomTsetRecovery",
  "DifferenceBetweenActualRandomRedflyRecovery"
)

## Ranges of the actual-minus-random differences, used when writing up the
## observations: per training set, the min / median / max of each difference
## column, scaled from fraction to percent.

# Returns a list keyed by training-set name holding `stat` of `column` over
# that set's rows of the difference table, multiplied by 100.
tset_stat_percent <- function(column, stat) {
  tbl <- subsetIMMevaluationOutputContRand1000_Actual1000_diff
  out <- list()
  for (tset in unique(tbl$TsetName)) {
    out[[tset]] <- stat(tbl[[column]][tbl$TsetName == tset]) * 100
  }
  out
}

# REDfly recovery
minimum_differenceBetweenActualRandomRedflyRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomRedflyRecovery", min)
maximum_differenceBetweenActualRandomRedflyRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomRedflyRecovery", max)
median_differenceBetweenActualRandomRedflyRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomRedflyRecovery", median)

# Training-set sensitivity
minimum_DifferenceBetweenActualTsetRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomTsetRecovery", min)
maximum_DifferenceBetweenActualTsetRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomTsetRecovery", max)
median_DifferenceBetweenActualTsetRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomTsetRecovery", median)

# Specificity
minimum_specificity <-
  tset_stat_percent("DifferenceBetweenActualRandomPercentageSpecificity", min)
maximum_specificity <-
  tset_stat_percent("DifferenceBetweenActualRandomPercentageSpecificity", max)
median_specificity <-
  tset_stat_percent("DifferenceBetweenActualRandomPercentageSpecificity", median)
library(qdapTools)
# Turn each per-training-set list into a two-column data frame: the statistic
# (named by the first label) and the training-set name ("TsetName").
minred <- list2df(
  minimum_differenceBetweenActualRandomRedflyRecovery,
  "minimum difference to random in redfly recovery", "TsetName"
)
medred <- list2df(
  median_differenceBetweenActualRandomRedflyRecovery,
  "median difference to random in redfly recovery", "TsetName"
)
maxred <- list2df(
  maximum_differenceBetweenActualRandomRedflyRecovery,
  "maximum difference to random in redfly recovery", "TsetName"
)
minTset <- list2df(
  minimum_DifferenceBetweenActualTsetRecovery,
  "minimum difference to random in Tset sensitivity", "TsetName"
)
maxTset <- list2df(
  maximum_DifferenceBetweenActualTsetRecovery,
  "maximum difference to random in Tset sensitivity", "TsetName"
)
medTset <- list2df(
  median_DifferenceBetweenActualTsetRecovery,
  "median difference to random in Tset sensitivity", "TsetName"
)
minSpec <- list2df(
  minimum_specificity,
  "minimum difference to random in specificity", "TsetName"
)
maxSpec <- list2df(
  maximum_specificity,
  "maximum difference to random in specificity", "TsetName"
)
medSpec <- list2df(
  median_specificity,
  "median difference to random in specificity", "TsetName"
)

# Only the three medians go into the final table: one row per training set,
# joined on TsetName.
df1 <- merge(medred, medTset, by = "TsetName")
dffinal <- merge(df1, medSpec, by = "TsetName")
#dffinal<-merge(df7,by="TsetName")

#data.frame(unclass(table(dffinal)))
#table(dffinal)

# Save the median table for the median-amplitude-curve cutoff (tab-separated).
write.table(
  dffinal,
  file = "~/Box/Old_and_newTsets_3postProcMethods_3categories/oldTsets/finaltable_medianAmplitudeCurve_old29tsets.txt",
  sep = "\t"
)

library(knitr)
library(kableExtra)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdapTools':
## 
##     id
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1.3.1.1 Good Tset

Conditions being set for these good Tsets include:

  • Difference to random in Redfly recovery must be greater than or equal to 8 percent at cutoff,
  • Difference to random in Tset sensitivity must be greater than or equal to 10 percent,
  • Median difference to random in specificity must be greater than or equal to 8 percent.
finaltable <- list()

# Conditions for overall good Tsets.
# Work on a plain data frame with short, syntactic column names so the
# thresholds below can refer to the columns directly.
dffinalDF <- as.data.frame(dffinal)
colnames(dffinalDF) <- c(
  "TsetName",
  "MedianDifferenceToRandomOfRedflyRecovery",
  "MedianDifferenceInTsetSensitivity",
  "MedianDifferenceInTsetSpecificity"
)

# A set is "good" only when all three medians clear their thresholds.
goodTsetOverAll <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 10 &
    MedianDifferenceInTsetSpecificity >= 8
)

goodTsetsOverAll <- list()

# Switch to human-readable headers for the rendered table.
colnames(goodTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
goodTsetOverAll <- goodTsetOverAll[1:4]

kable_input3 <- kable(
  goodTsetOverAll,
  digits = 2,
  caption = "Overall Good Training Sets"
)
#column_spec(kable_input3,2:10,width = "1cm")
kable_input3
Overall Good Training Sets
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
#goodTsetOverAll

1.3.1.2 Good Tsets Ignoring Specificity

Conditions being set for these good Tsets (if we ignore specificity) include:

  • Difference to random in Redfly recovery must be greater than or equal to 10 percent at cutoff,
  • Difference to random in Tset sensitivity must be greater than or equal to 10 percent.
# Sets that clear the REDfly-recovery and sensitivity thresholds; specificity
# is shown in the table but not used as a criterion.
goodTsetsIgnoringSpecificity <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 10 &
    MedianDifferenceInTsetSensitivity >= 10
)

# Human-readable headers for the rendered table.
colnames(goodTsetsIgnoringSpecificity) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
goodTsetsIgnoringSpecificity <- goodTsetsIgnoringSpecificity[1:4]

kable_input4 <- kable(
  goodTsetsIgnoringSpecificity,
  digits = 2,
  caption = "Good Training Sets Ignoring Poor Specificity"
)
#column_spec(kable_input4,2:10,width = "2cm")
kable_input4
Good Training Sets Ignoring Poor Specificity
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
mapping1.blastoderm 17.59 10.83 5.85
mapping1.glia 12.33 58.39 -1.97
mapping1.malpighian_tubules 10.51 41.72 -7.70
mapping1.tracheal_system 17.34 34.58 -3.49
mapping1.ventral_ectoderm 11.32 36.16 -6.08
mapping1.visceral_mesoderm 23.12 22.67 -5.28
mapping2.glia 17.97 58.39 -1.25

1.3.1.3 Intermediate

These are the sets that fall into neither the very good nor the very bad category, mainly because they perform well in two measures but not in the third. Sets already categorized as good are excluded.

Specific conditions being set for these Intermediate sets include:

  • Difference to random in Redfly recovery must be at least 8 and difference to random in Tset sensitivity must be at least 10, while difference to random in specificity may fall below the "good" threshold of 8 but must not be less than random (at least 0), OR
  • Difference to random in Redfly recovery must be at least 7 and difference to random in specificity must be at least 5, while difference to random in Tset sensitivity may be less than 10 but must not be less than random (at least 0), OR
  • Difference to random in Tset sensitivity must be at least 10 and difference to random in specificity must be at least 5, while difference to random in Redfly recovery may be less than 8 but must not be less than random (at least 0).
# Candidate intermediate sets: strong in two measures and at least as good as
# random (>= 0) in the third. Each parenthesised clause is one alternative.
intermediateTSets <- filter(
  dffinalDF,
  (MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 10 &
    MedianDifferenceInTsetSpecificity >= 0) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 0 &
      MedianDifferenceInTsetSensitivity >= 10 &
      MedianDifferenceInTsetSpecificity >= 5) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 7 &
      MedianDifferenceInTsetSensitivity >= 0 &
      MedianDifferenceInTsetSpecificity >= 5)
)

# Keep only the candidates not already classified as overall good.
# Rows are selected by direct indexing rather than a `1:length(...)` loop:
# with no qualifying rows that loop ran over c(1, 0) and appended an all-NA
# row to the table.
onlyintermediateTSetsNum <- which(
  !(intermediateTSets$TsetName %in% goodTsetOverAll$TsetName)
)
onlyintermediateTSets <- intermediateTSets[onlyintermediateTSetsNum, ]

intermediateTSets <- onlyintermediateTSets

# Human-readable headers for the rendered table.
colnames(intermediateTSets) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
intermediateTSets <- intermediateTSets[1:4]

kable_input4 <- kable(
  intermediateTSets,
  digits = 2,
  caption = "Intermediate Tsets"
)
#column_spec(kable_input4,2:10,width = "3cm")
kable_input4
Intermediate Tsets
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
mapping1.blastoderm 17.59 10.83 5.85

1.3.1.4 Poor Tsets

Conditions being set for these poor sets: a set is classified as poor if it fulfills any of the following conditions:

  • Difference to random in Redfly recovery is less than or equal to 0, OR
  • Difference to random in Tset sensitivity is less than or equal to 0, OR
  • Difference to random in specificity is less than or equal to 0.
# Poor Tsets: any single median difference at or below random (<= 0) is enough.
poorTsetOverAll <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery <= 0 |
    MedianDifferenceInTsetSensitivity <= 0 |
    MedianDifferenceInTsetSpecificity <= 0
)

# Human-readable headers for the rendered table.
colnames(poorTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
poorTsetOverAll <- poorTsetOverAll[1:4]

kable_input6 <- kable(
  poorTsetOverAll,
  digits = 2,
  caption = "Overall Poor Training Sets"
)
#column_spec(kable_input6,2:10,width = "2cm")
kable_input6
Overall Poor Training Sets
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
mapping1.adult_mesoderm 8.53 20.89 -3.92
mapping1.amnioserosa -0.56 33.39 -8.14
mapping1.cns 0.48 -9.31 -8.15
mapping1.dorsal_ectoderm 6.12 43.00 -5.91
mapping1.ectoderm 9.89 9.33 -6.50
mapping1.endoderm 17.95 3.21 -7.34
mapping1.eye 4.80 33.39 -7.76
mapping1.female_gonad -8.96 -3.15 -6.64
mapping1.glia 12.33 58.39 -1.97
mapping1.imaginal_disc 6.14 -9.15 -5.96
mapping1.male_gonad -3.17 5.05 -9.58
mapping1.malpighian_tubules 10.51 41.72 -7.70
mapping1.mesectoderm 7.93 58.39 -6.13
mapping1.mesoderm 0.25 23.77 -5.90
mapping1.neuroectoderm 7.29 36.16 -4.78
mapping1.pns 3.71 6.66 -7.46
mapping1.salivary_gland 23.03 1.24 -5.26
mapping1.somatic_muscle 8.84 5.61 -4.00
mapping1.tracheal_system 17.34 34.58 -3.49
mapping1.ventral_ectoderm 11.32 36.16 -6.08
mapping1.visceral_mesoderm 23.12 22.67 -5.28
mapping2.ectoderm 3.07 0.28 -6.39
mapping2.eye 3.15 11.96 -7.62
mapping2.glia 17.97 58.39 -1.25
mapping2.mesoderm 15.81 -13.61 -6.35
mapping2.neuronal -3.80 -10.69 -8.32
mapping2.reproductive_system 12.95 -18.28 -9.58
mapping2.wing 8.93 6.01 -5.97

1.3.2 Top Elbow of Amplitude curve

# Baseline results obtained from random training sets.
top_fake <- read.table(
  "/Users/hasibaasma/Box/complete_data_representation_newMethod/files/del_randomAvg62sets_28times_Excel_withSpec.bed",
  header = TRUE
)

# Actual IMM results at the elbow-of-amplitude-curve cutoff. The variable name
# ends in "_Med" but holds the elbow table in this section.
top_orig_oldSets_Med <- top_elbow_imm

# Sort both tables by training-set name; the columns below are paired purely
# by row position, so row i of s1 must describe the same set as row i of s2.
s1 <- top_orig_oldSets_Med[order(top_orig_oldSets_Med$TsetName), ]
s2 <- top_fake[order(top_fake$TsetName), ]
if (!identical(as.character(s1$TsetName), as.character(s2$TsetName))) {
  warning(
    "TsetName of the actual and random tables do not match row-for-row; ",
    "the actual-minus-random differences may pair unrelated training sets",
    call. = FALSE
  )
}

# Columns 1-8: actual results; 9: actual expression-pattern recall minus
# random specificity; 10-14: random results; 15-16: actual minus random
# training-set sensitivity and REDfly recovery.
# NOTE(review): column 9 subtracts s2$PercentageSpecificity from
# s1$percentageExpressionPatternRecall - confirm these are comparable measures.
subsetIMMevaluationOutputContRand1000_Actual1000_diff <- cbind.data.frame(
  s1$TsetName,
  s1$Method,
  s1$TsetSize,
  s1$SCRMs,
  s1$TrainingSetRecovered,
  s1$PercentageTrainingSetSensitivity,
  s1$REDflyRecovered,
  s1$PercentageRedflyRecovered,
  s1$percentageExpressionPatternRecall - s2$PercentageSpecificity,
  s2$SCRMs,
  s2$TrainingSetRecovered,
  s2$PercentageTrainingSetSensitivity,
  s2$REDflyRecovered,
  s2$PercentageRedflyRecovered,
  s1$PercentageTrainingSetSensitivity - s2$PercentageTrainingSetSensitivity,
  s1$PercentageRedflyRecovered - s2$PercentageRedflyRecovered
)

# Give readable names to the columns used downstream; the remaining columns
# keep the names cbind.data.frame derived from the expressions above.
colnames(subsetIMMevaluationOutputContRand1000_Actual1000_diff)[c(1, 4, 9, 15, 16)] <- c(
  "TsetName",
  "SCRMs",
  "DifferenceBetweenActualRandomPercentageSpecificity",
  "DifferenceBetweenActualRandomTsetRecovery",
  "DifferenceBetweenActualRandomRedflyRecovery"
)

## Ranges of the actual-minus-random differences, used when writing up the
## observations: per training set, the min / median / max of each difference
## column, scaled from fraction to percent.

# Returns a list keyed by training-set name holding `stat` of `column` over
# that set's rows of the difference table, multiplied by 100.
tset_stat_percent <- function(column, stat) {
  tbl <- subsetIMMevaluationOutputContRand1000_Actual1000_diff
  out <- list()
  for (tset in unique(tbl$TsetName)) {
    out[[tset]] <- stat(tbl[[column]][tbl$TsetName == tset]) * 100
  }
  out
}

# REDfly recovery
minimum_differenceBetweenActualRandomRedflyRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomRedflyRecovery", min)
maximum_differenceBetweenActualRandomRedflyRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomRedflyRecovery", max)
median_differenceBetweenActualRandomRedflyRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomRedflyRecovery", median)

# Training-set sensitivity
minimum_DifferenceBetweenActualTsetRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomTsetRecovery", min)
maximum_DifferenceBetweenActualTsetRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomTsetRecovery", max)
median_DifferenceBetweenActualTsetRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomTsetRecovery", median)

# Specificity
minimum_specificity <-
  tset_stat_percent("DifferenceBetweenActualRandomPercentageSpecificity", min)
maximum_specificity <-
  tset_stat_percent("DifferenceBetweenActualRandomPercentageSpecificity", max)
median_specificity <-
  tset_stat_percent("DifferenceBetweenActualRandomPercentageSpecificity", median)
library(qdapTools)
# Turn each per-training-set list into a two-column data frame: the statistic
# (named by the first label) and the training-set name ("TsetName").
minred <- list2df(
  minimum_differenceBetweenActualRandomRedflyRecovery,
  "minimum difference to random in redfly recovery", "TsetName"
)
medred <- list2df(
  median_differenceBetweenActualRandomRedflyRecovery,
  "median difference to random in redfly recovery", "TsetName"
)
maxred <- list2df(
  maximum_differenceBetweenActualRandomRedflyRecovery,
  "maximum difference to random in redfly recovery", "TsetName"
)
minTset <- list2df(
  minimum_DifferenceBetweenActualTsetRecovery,
  "minimum difference to random in Tset sensitivity", "TsetName"
)
maxTset <- list2df(
  maximum_DifferenceBetweenActualTsetRecovery,
  "maximum difference to random in Tset sensitivity", "TsetName"
)
medTset <- list2df(
  median_DifferenceBetweenActualTsetRecovery,
  "median difference to random in Tset sensitivity", "TsetName"
)
minSpec <- list2df(
  minimum_specificity,
  "minimum difference to random in specificity", "TsetName"
)
maxSpec <- list2df(
  maximum_specificity,
  "maximum difference to random in specificity", "TsetName"
)
medSpec <- list2df(
  median_specificity,
  "median difference to random in specificity", "TsetName"
)

# Only the three medians go into the final table: one row per training set,
# joined on TsetName.
df1 <- merge(medred, medTset, by = "TsetName")
dffinal <- merge(df1, medSpec, by = "TsetName")
#dffinal<-merge(df7,by="TsetName")

#data.frame(unclass(table(dffinal)))
#table(dffinal)

# Save the median table for the elbow-amplitude-curve cutoff (tab-separated).
write.table(
  dffinal,
  file = "~/Box/Old_and_newTsets_3postProcMethods_3categories/oldTsets/finaltable_elbowAmplitudeCurve_old29tsets.txt",
  sep = "\t"
)

library(knitr)
library(kableExtra)
library(dplyr)

1.3.2.1 Good Tset

Conditions being set for these good Tsets include:

  • Difference to random in Redfly recovery must be greater than or equal to 8 percent at cutoff,
  • Difference to random in Tset sensitivity must be greater than or equal to 10 percent,
  • Median difference to random in specificity must be greater than or equal to 8 percent.
finaltable <- list()

# Conditions for overall good Tsets.
# Work on a plain data frame with short, syntactic column names so the
# thresholds below can refer to the columns directly.
dffinalDF <- as.data.frame(dffinal)
colnames(dffinalDF) <- c(
  "TsetName",
  "MedianDifferenceToRandomOfRedflyRecovery",
  "MedianDifferenceInTsetSensitivity",
  "MedianDifferenceInTsetSpecificity"
)

# A set is "good" only when all three medians clear their thresholds.
goodTsetOverAll <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 10 &
    MedianDifferenceInTsetSpecificity >= 8
)

goodTsetsOverAll <- list()

# Switch to human-readable headers for the rendered table.
colnames(goodTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
goodTsetOverAll <- goodTsetOverAll[1:4]

kable_input3 <- kable(
  goodTsetOverAll,
  digits = 2,
  caption = "Overall Good Training Sets"
)
#column_spec(kable_input3,2:10,width = "2cm")

kable_input3
Overall Good Training Sets
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff

1.3.2.2 Good Tsets Ignoring Specificity

Conditions being set for these good Tsets (if we ignore specificity) include:

  • Difference to random in Redfly recovery must be greater than or equal to 10 percent at cutoff,
  • Difference to random in Tset sensitivity must be greater than or equal to 10 percent.
# Sets that clear the REDfly-recovery and sensitivity thresholds; specificity
# is shown in the table but not used as a criterion.
goodTsetsIgnoringSpecificity <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 10 &
    MedianDifferenceInTsetSensitivity >= 10
)

# Human-readable headers for the rendered table.
colnames(goodTsetsIgnoringSpecificity) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
goodTsetsIgnoringSpecificity <- goodTsetsIgnoringSpecificity[1:4]

kable_input4 <- kable(
  goodTsetsIgnoringSpecificity,
  digits = 2,
  caption = "Good Training Sets Ignoring Poor Specificity"
)
#column_spec(kable_input4,2:10,width = "2cm")
kable_input4
Good Training Sets Ignoring Poor Specificity
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
mapping1.adult_mesoderm 54.97 20.89 -5.81
mapping1.amnioserosa 84.25 33.39 -8.50
mapping1.dorsal_ectoderm 41.76 35.31 -7.89
mapping1.eye 61.63 33.39 -5.95
mapping1.glia 43.14 58.39 -1.97
mapping1.malpighian_tubules 21.82 25.05 -8.64
mapping1.mesectoderm 47.39 58.39 -5.27
mapping1.mesoderm 13.56 12.23 -7.48
mapping1.neuroectoderm 49.97 36.16 -7.98
mapping1.tracheal_system 30.38 29.82 -6.00
mapping1.ventral_ectoderm 26.29 30.61 -7.66
mapping1.visceral_mesoderm 40.18 11.96 -6.23
mapping2.glia 48.66 58.39 -2.29

1.3.2.3 Intermediate

These are the sets that fall into neither the very good nor the very bad category, mainly because they perform well in two measures but not in the third. Sets already categorized as good are excluded.

Specific conditions being set for these Intermediate sets include:

  • Difference to random in Redfly recovery must be at least 8 and difference to random in Tset sensitivity must be at least 10, while difference to random in specificity may fall below the "good" threshold of 8 but must not be less than random (at least 0), OR
  • Difference to random in Redfly recovery must be at least 7 and difference to random in specificity must be at least 5, while difference to random in Tset sensitivity may be less than 10 but must not be less than random (at least 0), OR
  • Difference to random in Tset sensitivity must be at least 10 and difference to random in specificity must be at least 5, while difference to random in Redfly recovery may be less than 8 but must not be less than random (at least 0).
# Candidate intermediate sets: strong in two measures and at least as good as
# random (>= 0) in the third. Each parenthesised clause is one alternative.
intermediateTSets <- filter(
  dffinalDF,
  (MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 10 &
    MedianDifferenceInTsetSpecificity >= 0) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 0 &
      MedianDifferenceInTsetSensitivity >= 10 &
      MedianDifferenceInTsetSpecificity >= 5) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 7 &
      MedianDifferenceInTsetSensitivity >= 0 &
      MedianDifferenceInTsetSpecificity >= 5)
)

# Keep only the candidates not already classified as overall good.
# Rows are selected by direct indexing rather than a `1:length(...)` loop:
# with no qualifying rows that loop ran over c(1, 0) and appended an all-NA
# row, which is the "NA NA NA NA NA" line this table used to show.
onlyintermediateTSetsNum <- which(
  !(intermediateTSets$TsetName %in% goodTsetOverAll$TsetName)
)
onlyintermediateTSets <- intermediateTSets[onlyintermediateTSetsNum, ]

intermediateTSets <- onlyintermediateTSets

# Human-readable headers for the rendered table.
colnames(intermediateTSets) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
intermediateTSets <- intermediateTSets[1:4]

kable_input4 <- kable(
  intermediateTSets,
  digits = 2,
  caption = "Intermediate Tsets"
)
#column_spec(kable_input4,2:10,width = "3cm")
kable_input4
Intermediate Tsets
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
NA NA NA NA NA

1.3.2.4 Poor Tsets

Conditions being set for these poor sets: a set is classified as poor if it fulfills any of the following conditions:

  • Difference to random in Redfly recovery is less than or equal to 0, OR
  • Difference to random in Tset sensitivity is less than or equal to 0, OR
  • Difference to random in specificity is less than or equal to 0.
# Poor Tsets: any single median difference at or below random (<= 0) is enough.
poorTsetOverAll <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery <= 0 |
    MedianDifferenceInTsetSensitivity <= 0 |
    MedianDifferenceInTsetSpecificity <= 0
)

# Human-readable headers for the rendered table.
colnames(poorTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
poorTsetOverAll <- poorTsetOverAll[1:4]

kable_input6 <- kable(
  poorTsetOverAll,
  digits = 2,
  caption = "Overall Poor Training Sets"
)
#column_spec(kable_input6,2:10,width = "2cm")
kable_input6
Overall Poor Training Sets
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
mapping1.adult_mesoderm 54.97 20.89 -5.81
mapping1.amnioserosa 84.25 33.39 -8.50
mapping1.cns 0.49 -20.07 -8.82
mapping1.dorsal_ectoderm 41.76 35.31 -7.89
mapping1.ectoderm 10.20 -9.54 -7.92
mapping1.endoderm 51.79 -3.68 -9.17
mapping1.eye 61.63 33.39 -5.95
mapping1.female_gonad 5.26 -10.84 -6.64
mapping1.glia 43.14 58.39 -1.97
mapping1.imaginal_disc 12.33 -16.94 -6.62
mapping1.male_gonad 21.82 5.05 -7.83
mapping1.malpighian_tubules 21.82 25.05 -8.64
mapping1.mesectoderm 47.39 58.39 -5.27
mapping1.mesoderm 13.56 12.23 -7.48
mapping1.neuroectoderm 49.97 36.16 -7.98
mapping1.pns 11.43 -3.68 -7.79
mapping1.salivary_gland 54.58 -5.90 -7.42
mapping1.somatic_muscle 8.89 -2.72 -6.33
mapping1.tracheal_system 30.38 29.82 -6.00
mapping1.ventral_ectoderm 26.29 30.61 -7.66
mapping1.visceral_mesoderm 40.18 11.96 -6.23
mapping2.ectoderm 4.16 -5.13 -7.46
mapping2.eye 19.36 1.24 -8.93
mapping2.glia 48.66 58.39 -2.29
mapping2.mesoderm 28.77 -17.61 -7.18
mapping2.neuronal 8.06 -20.99 -9.17
mapping2.reproductive_system 34.03 -21.61 -8.86
mapping2.wing 9.88 -3.52 -2.96

1.3.3 Top None of Amplitude curve

# Baseline results obtained from random training sets.
top_fake <- read.table(
  "/Users/hasibaasma/Box/complete_data_representation_newMethod/files/del_randomAvg62sets_28times_Excel_withSpec.bed",
  header = TRUE
)

# Actual IMM results with no cutoff applied. The variable name ends in "_Med"
# but holds the no-cutoff table in this section.
top_orig_oldSets_Med <- top_none_imm

# Sort both tables by training-set name; the columns below are paired purely
# by row position, so row i of s1 must describe the same set as row i of s2.
s1 <- top_orig_oldSets_Med[order(top_orig_oldSets_Med$TsetName), ]
s2 <- top_fake[order(top_fake$TsetName), ]
if (!identical(as.character(s1$TsetName), as.character(s2$TsetName))) {
  warning(
    "TsetName of the actual and random tables do not match row-for-row; ",
    "the actual-minus-random differences may pair unrelated training sets",
    call. = FALSE
  )
}

# Columns 1-8: actual results; 9: actual expression-pattern recall minus
# random specificity; 10-14: random results; 15-16: actual minus random
# training-set sensitivity and REDfly recovery.
# NOTE(review): column 9 subtracts s2$PercentageSpecificity from
# s1$percentageExpressionPatternRecall - confirm these are comparable measures.
subsetIMMevaluationOutputContRand1000_Actual1000_diff <- cbind.data.frame(
  s1$TsetName,
  s1$Method,
  s1$TsetSize,
  s1$SCRMs,
  s1$TrainingSetRecovered,
  s1$PercentageTrainingSetSensitivity,
  s1$REDflyRecovered,
  s1$PercentageRedflyRecovered,
  s1$percentageExpressionPatternRecall - s2$PercentageSpecificity,
  s2$SCRMs,
  s2$TrainingSetRecovered,
  s2$PercentageTrainingSetSensitivity,
  s2$REDflyRecovered,
  s2$PercentageRedflyRecovered,
  s1$PercentageTrainingSetSensitivity - s2$PercentageTrainingSetSensitivity,
  s1$PercentageRedflyRecovered - s2$PercentageRedflyRecovered
)

# Give readable names to the columns used downstream; the remaining columns
# keep the names cbind.data.frame derived from the expressions above.
colnames(subsetIMMevaluationOutputContRand1000_Actual1000_diff)[c(1, 4, 9, 15, 16)] <- c(
  "TsetName",
  "SCRMs",
  "DifferenceBetweenActualRandomPercentageSpecificity",
  "DifferenceBetweenActualRandomTsetRecovery",
  "DifferenceBetweenActualRandomRedflyRecovery"
)

## Ranges of the actual-minus-random differences, used when writing up the
## observations: per training set, the min / median / max of each difference
## column, scaled from fraction to percent.

# Returns a list keyed by training-set name holding `stat` of `column` over
# that set's rows of the difference table, multiplied by 100.
tset_stat_percent <- function(column, stat) {
  tbl <- subsetIMMevaluationOutputContRand1000_Actual1000_diff
  out <- list()
  for (tset in unique(tbl$TsetName)) {
    out[[tset]] <- stat(tbl[[column]][tbl$TsetName == tset]) * 100
  }
  out
}

# REDfly recovery
minimum_differenceBetweenActualRandomRedflyRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomRedflyRecovery", min)
maximum_differenceBetweenActualRandomRedflyRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomRedflyRecovery", max)
median_differenceBetweenActualRandomRedflyRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomRedflyRecovery", median)

# Training-set sensitivity
minimum_DifferenceBetweenActualTsetRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomTsetRecovery", min)
maximum_DifferenceBetweenActualTsetRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomTsetRecovery", max)
median_DifferenceBetweenActualTsetRecovery <-
  tset_stat_percent("DifferenceBetweenActualRandomTsetRecovery", median)

# Specificity
minimum_specificity <-
  tset_stat_percent("DifferenceBetweenActualRandomPercentageSpecificity", min)
maximum_specificity <-
  tset_stat_percent("DifferenceBetweenActualRandomPercentageSpecificity", max)
median_specificity <-
  tset_stat_percent("DifferenceBetweenActualRandomPercentageSpecificity", median)
library(qdapTools)
# Turn each per-training-set list into a two-column data frame: the statistic
# (named by the first label) and the training-set name ("TsetName").
minred <- list2df(
  minimum_differenceBetweenActualRandomRedflyRecovery,
  "minimum difference to random in redfly recovery", "TsetName"
)
medred <- list2df(
  median_differenceBetweenActualRandomRedflyRecovery,
  "median difference to random in redfly recovery", "TsetName"
)
maxred <- list2df(
  maximum_differenceBetweenActualRandomRedflyRecovery,
  "maximum difference to random in redfly recovery", "TsetName"
)
minTset <- list2df(
  minimum_DifferenceBetweenActualTsetRecovery,
  "minimum difference to random in Tset sensitivity", "TsetName"
)
maxTset <- list2df(
  maximum_DifferenceBetweenActualTsetRecovery,
  "maximum difference to random in Tset sensitivity", "TsetName"
)
medTset <- list2df(
  median_DifferenceBetweenActualTsetRecovery,
  "median difference to random in Tset sensitivity", "TsetName"
)
minSpec <- list2df(
  minimum_specificity,
  "minimum difference to random in specificity", "TsetName"
)
maxSpec <- list2df(
  maximum_specificity,
  "maximum difference to random in specificity", "TsetName"
)
medSpec <- list2df(
  median_specificity,
  "median difference to random in specificity", "TsetName"
)

# Only the three medians go into the final table: one row per training set,
# joined on TsetName.
df1 <- merge(medred, medTset, by = "TsetName")
dffinal <- merge(df1, medSpec, by = "TsetName")
#dffinal<-merge(df7,by="TsetName")

#data.frame(unclass(table(dffinal)))
#table(dffinal)

# Save the median table for the no-cutoff run (tab-separated).
write.table(
  dffinal,
  file = "~/Box/Old_and_newTsets_3postProcMethods_3categories/oldTsets/finaltable_noneAmplitudeCurve_old29tsets.txt",
  sep = "\t"
)

library(knitr)
library(kableExtra)
library(dplyr)

1.3.3.1 Good Tset

Conditions being set for these good Tsets include:

  • Difference to random in Redfly recovery must be greater than or equal to 8 percent at cutoff,
  • Difference to random in Tset sensitivity must be greater than or equal to 10 percent,
  • Median of specificity must be greater than or equal to 8 percent.
# Classify the overall "good" training sets from the merged median table.
finaltable <- list()

# Work on a plain data frame with short, syntactic column names so the
# thresholds below can refer to the columns directly.
dffinalDF <- as.data.frame(dffinal)
names(dffinalDF) <- c(
  "TsetName",
  "MedianDifferenceToRandomOfRedflyRecovery",
  "MedianDifferenceInTsetSensitivity",
  "MedianDifferenceInTsetSpecificity"
)

# A set is good overall only when all three medians reach their threshold.
goodTsetOverAll <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 10 &
    MedianDifferenceInTsetSpecificity >= 8
)

goodTsetsOverAll <- list()

# Descriptive headers for the rendered table.
names(goodTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
goodTsetOverAll <- cbind(
  goodTsetOverAll[1],
  goodTsetOverAll[2],
  goodTsetOverAll[3],
  goodTsetOverAll[4]
)

kable_input3 <- kable(
  goodTsetOverAll,
  digits = 2,
  caption = "Overall Good Training Sets"
)
kable_input3
Overall Good Training Sets
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff

1.3.3.2 Good Tsets Ignoring Specificity

Conditions being set for these good Tsets(if we ignore specificity) include:

  • Difference to random in Redfly recovery must be greater than or equal to 10 percent at cutoff,
  • Difference in Tset sensitivity to random must be greater than or equal to 10 percent.
# Good sets when specificity is ignored: only Redfly recovery and Tset
# sensitivity have to reach their thresholds.
goodTsetsIgnoringSpecificity <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 10 &
    MedianDifferenceInTsetSensitivity >= 10
)

# Descriptive headers for the rendered table.
names(goodTsetsIgnoringSpecificity) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
goodTsetsIgnoringSpecificity <- cbind(
  goodTsetsIgnoringSpecificity[1],
  goodTsetsIgnoringSpecificity[2],
  goodTsetsIgnoringSpecificity[3],
  goodTsetsIgnoringSpecificity[4]
)

kable_input4 <- kable(
  goodTsetsIgnoringSpecificity,
  digits = 2,
  caption = "Good Training Sets Ignoring Poor Specificity"
)
kable_input4
Good Training Sets Ignoring Poor Specificity
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
mapping1.glia 10.34 58.39 -1.97
mapping1.tracheal_system 13.41 39.34 -1.34
mapping1.visceral_mesoderm 18.26 26.24 -2.88
mapping2.glia 10.09 58.39 -1.25

1.3.3.3 Intermediate

These are the sets that fall into neither the very good nor the very bad training sets, mainly because they perform well in two measures but not in the third. Sets already categorized as good are excluded.

Specific conditions being set for these Intermediate sets include:

  • Difference to random in Redfly recovery must be at least 8 and difference to random in Tset sensitivity must be at least 10, while specificity may be lower but not less than random (at least 0), OR
  • Difference to random in Redfly recovery must be at least 7 and specificity must be at least 5, while difference to random in Tset sensitivity may be less than 10 but not less than random (at least 0), OR
  • Difference to random in Tset sensitivity must be at least 10 and specificity must be at least 5, while difference to random in Redfly recovery may be lower but not less than random (at least 0).
# Candidate intermediate sets: strong in two measures and at least as good as
# random (>= 0) in the remaining one.
intermediateTSets <- filter(
  dffinalDF,
  (MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 10 &
    MedianDifferenceInTsetSpecificity >= 0) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 0 &
      MedianDifferenceInTsetSensitivity >= 10 &
      MedianDifferenceInTsetSpecificity >= 5) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 7 &
      MedianDifferenceInTsetSensitivity >= 0 &
      MedianDifferenceInTsetSpecificity >= 5)
)

# Drop the sets already classified as good overall. Rows are selected in one
# subsetting step instead of a `for (i in 1:length(...))` loop: when no row
# remained, that loop still ran for i = 1 and i = 0 and appended an all-NA row.
onlyintermediateTSetsNum <- which(
  !(intermediateTSets$TsetName %in% goodTsetOverAll$TsetName)
)
onlyintermediateTSets <- intermediateTSets[onlyintermediateTSetsNum, , drop = FALSE]

intermediateTSets <- onlyintermediateTSets

# Descriptive headers for the rendered table.
colnames(intermediateTSets) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
intermediateTSets <- cbind(
  intermediateTSets[1],
  intermediateTSets[2],
  intermediateTSets[3],
  intermediateTSets[4]
)

kable_input4 <- kable(intermediateTSets, digits = 2, caption = "Intermediate Tsets")
kable_input4
Intermediate Tsets
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
mapping1.blastoderm 5.56 18.14 10.17

1.3.3.4 Poor Tsets

A set is classified as poor if it fulfills any of the following conditions:

  • Difference to random in Redfly recovery is less than or equal to 0, OR
  • Difference in Tset sensitivity is less than or equal to 0, OR
  • Specificity is less than or equal to 0.
# Poor training sets: any one of the three medians is at or below random (<= 0).
poorTsetOverAll <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery <= 0 |
    MedianDifferenceInTsetSensitivity <= 0 |
    MedianDifferenceInTsetSpecificity <= 0
)

# Descriptive headers for the rendered table.
names(poorTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
poorTsetOverAll <- cbind(
  poorTsetOverAll[1],
  poorTsetOverAll[2],
  poorTsetOverAll[3],
  poorTsetOverAll[4]
)

kable_input6 <- kable(
  poorTsetOverAll,
  digits = 2,
  caption = "Overall Poor Training Sets"
)
kable_input6
Overall Poor Training Sets
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
mapping1.adult_mesoderm 3.86 20.89 -3.92
mapping1.amnioserosa -0.65 33.39 -8.14
mapping1.cns 1.17 1.46 -6.46
mapping1.dorsal_ectoderm 3.59 50.69 -5.35
mapping1.ectoderm 6.08 14.99 -4.75
mapping1.endoderm 11.88 6.66 -6.25
mapping1.eye -0.22 33.39 -7.76
mapping1.female_gonad -11.71 4.54 -5.17
mapping1.glia 10.34 58.39 -1.97
mapping1.imaginal_disc 6.70 10.34 -4.65
mapping1.male_gonad -6.34 5.05 -7.83
mapping1.malpighian_tubules 5.54 41.72 -7.70
mapping1.mesectoderm 9.94 58.39 -5.27
mapping1.mesoderm -0.24 31.46 -3.79
mapping1.neuroectoderm 1.03 36.16 -4.78
mapping1.pns -0.35 15.28 -6.15
mapping1.salivary_gland 18.50 1.24 -3.10
mapping1.somatic_muscle 8.27 8.39 -0.75
mapping1.tracheal_system 13.41 39.34 -1.34
mapping1.ventral_ectoderm 6.77 36.16 -5.03
mapping1.visceral_mesoderm 18.26 26.24 -2.88
mapping2.ectoderm 0.46 11.09 -4.03
mapping2.eye 0.29 11.96 -6.97
mapping2.glia 10.09 58.39 -1.25
mapping2.mesoderm 11.28 -4.61 -4.04
mapping2.neuronal -2.12 -4.50 -7.30
mapping2.reproductive_system 10.57 -11.61 -8.86
mapping2.wing 7.88 20.29 -2.96

2 New 74 Training sets

New 74 training sets constructed in January 2020

2.1 Boxplot New Tsets: None vs Elbow vs Median

# Evaluation tables for the three top-N post-processing methods
# (None, Median, Elbow) on the new training sets.
top_none <- read.table(
  file = "~/Box/NewTrainingSetsJan2020/pCRMeval_June20_postProcM/None/output_topNone_allNew74sets_IMM.bed",
  header = TRUE
)
top_median_imm <- read.table(
  file = "~/Box/NewTrainingSetsJan2020/pCRMeval_June20_postProcM/Median/output_topMed_allNew74sets_IMM.bed",
  header = TRUE
)
top_elbow_imm <- read.table(
  file = "~/Box/NewTrainingSetsJan2020/pCRMeval_June20_postProcM/elbow/output_topElbow_allNew74sets_IMM.bed",
  header = TRUE
)

# Keep only the rows of the "None" table produced by the IMM method.
top_none_imm <- subset(top_none, Method == "imm")

# One box per measure and method, as percentages: training-set sensitivity
# (TS), Redfly recovery (RR) and expression-pattern precision (EP).
boxplot(
  top_none_imm$PercentageTrainingSetSensitivity * 100,
  top_median_imm$PercentageTrainingSetSensitivity * 100,
  top_elbow_imm$PercentageTrainingSetSensitivity * 100,
  top_none_imm$PercentageRedflyRecovered * 100,
  top_median_imm$PercentageRedflyRecovered * 100,
  top_elbow_imm$PercentageRedflyRecovered * 100,
  top_none_imm$percentageExpressionPatternPrecision * 100,
  top_median_imm$percentageExpressionPatternPrecision * 100,
  top_elbow_imm$percentageExpressionPatternPrecision * 100,
  names = c(
    "TS_None", "TS_Med", "TS_Elb",
    "RR_None", "RR_Med", "RR_Elb",
    "EP_None", "EP_Med", "EP_Elb"
  ),
  main = "Percentage_comparison_3methods_allmeasures_new74sets",
  boxwex = 0.6,
  col = rep(c("orange", "red", "blue"), times = 3),
  ylim = c(0, 100)
)

2.2 Dotplots 3 top N methods

2.2.0.1 Training set Sensitivity

library(ggplot2)
# Training-set sensitivity (percent) against SCRMs, one panel per training set.
# The colour values are named so that each method keeps the colour it has in
# the boxplot (None = orange, Median = red, Elbow = blue). Unnamed values are
# assigned to the legend keys in alphabetical order, which swapped the colours
# of None and Elbow.
ggplot(top_none_imm) +
  facet_wrap(~TsetName) +
  geom_point(
    data = top_none_imm,
    aes(x = SCRMs, y = PercentageTrainingSetSensitivity * 100, colour = "None"),
    pch = 8
  ) +
  geom_point(
    data = top_median_imm,
    aes(x = SCRMs, y = PercentageTrainingSetSensitivity * 100, colour = "Median"),
    pch = 8
  ) +
  geom_point(
    data = top_elbow_imm,
    aes(x = SCRMs, y = PercentageTrainingSetSensitivity * 100, colour = "Elbow"),
    pch = 8
  ) +
  scale_colour_manual(
    values = c(None = "orange", Median = "red", Elbow = "blue")
  ) +
  ylim(0, 100) +
  theme_bw()

2.2.0.2 Redfly Recovery

# Redfly recovery (percent) against SCRMs, one panel per training set.
# The colour values are named so that each method keeps the colour it has in
# the boxplot (None = orange, Median = red, Elbow = blue). Unnamed values are
# assigned to the legend keys in alphabetical order, which swapped the colours
# of None and Elbow.
ggplot(top_none_imm) +
  facet_wrap(~TsetName) +
  geom_point(
    data = top_none_imm,
    aes(x = SCRMs, y = PercentageRedflyRecovered * 100, colour = "None"),
    pch = 8
  ) +
  geom_point(
    data = top_median_imm,
    aes(x = SCRMs, y = PercentageRedflyRecovered * 100, colour = "Median"),
    pch = 8
  ) +
  geom_point(
    data = top_elbow_imm,
    aes(x = SCRMs, y = PercentageRedflyRecovered * 100, colour = "Elbow"),
    pch = 8
  ) +
  scale_colour_manual(
    values = c(None = "orange", Median = "red", Elbow = "blue")
  ) +
  ylim(0, 100) +
  theme_bw()

2.2.0.3 Expression Pattern Precision

# Expression-pattern precision (percent) against SCRMs, one panel per training
# set. The colour values are named so that each method keeps the colour it has
# in the boxplot (None = orange, Median = red, Elbow = blue). Unnamed values
# are assigned to the legend keys in alphabetical order, which swapped the
# colours of None and Elbow.
ggplot(top_none_imm) +
  facet_wrap(~TsetName) +
  geom_point(
    data = top_none_imm,
    aes(x = SCRMs, y = percentageExpressionPatternPrecision * 100, colour = "None"),
    pch = 8
  ) +
  geom_point(
    data = top_median_imm,
    aes(x = SCRMs, y = percentageExpressionPatternPrecision * 100, colour = "Median"),
    pch = 8
  ) +
  geom_point(
    data = top_elbow_imm,
    aes(x = SCRMs, y = percentageExpressionPatternPrecision * 100, colour = "Elbow"),
    pch = 8
  ) +
  ylim(0, 100) +
  scale_colour_manual(
    values = c(None = "orange", Median = "red", Elbow = "blue")
  ) +
  theme_bw()

2.2.0.4 Expression Pattern Recall

# Expression-pattern recall (percent) against SCRMs, one panel per training set.
# NOTE(review): the None layer plots percentageExpressionPatternRecall while
# the Median and Elbow layers plot ExpectedpercentageExpressionPatternRecall;
# confirm that mixing observed and expected recall in one plot is intended.
# The colour values are named so that each method keeps the colour it has in
# the boxplot (None = orange, Median = red, Elbow = blue). Unnamed values are
# assigned to the legend keys in alphabetical order, which swapped the colours
# of None and Elbow.
ggplot(top_none_imm) +
  facet_wrap(~TsetName) +
  geom_point(
    data = top_none_imm,
    aes(x = SCRMs, y = percentageExpressionPatternRecall * 100, colour = "None"),
    pch = 8
  ) +
  geom_point(
    data = top_median_imm,
    aes(x = SCRMs, y = ExpectedpercentageExpressionPatternRecall * 100, colour = "Median"),
    pch = 8
  ) +
  geom_point(
    data = top_elbow_imm,
    aes(x = SCRMs, y = ExpectedpercentageExpressionPatternRecall * 100, colour = "Elbow"),
    pch = 8
  ) +
  ylim(0, 100) +
  scale_colour_manual(
    values = c(None = "orange", Median = "red", Elbow = "blue")
  ) +
  theme_bw()

2.3 New Training sets classification

2.3.1 Top Median of Amplitude curve

# Random (control) evaluation results that the actual results are compared to.
top_fake <- read.table(
  "/Users/hasibaasma/Box/NewTrainingSetsJan2020/R/random62oldRun/old/del_randomMEDIAN62sets_75times_Excel_withSpec.txt",
  header = TRUE
)

# Actual results for this section: the Median post-processing table.
top_orig_newMed <- top_median_imm

# Sort both tables by training-set name; they are paired row by row below.
s1 <- top_orig_newMed[order(top_orig_newMed$TsetName), ]
s2 <- top_fake[order(top_fake$TsetName), ]

# The pairing is purely positional, so make a mismatch visible instead of
# silently subtracting rows that belong to different training sets.
if (!identical(as.character(s1$TsetName), as.character(s2$TsetName))) {
  warning(
    "Actual and random evaluation tables do not list the same TsetName ",
    "values row by row; actual-vs-random differences may pair unrelated rows.",
    call. = FALSE
  )
}

# Columns 1-8: actual results; 9: observed minus expected expression-pattern
# recall of the actual results; 10-14: random results; 15-16: actual minus
# random for Tset sensitivity and Redfly recovery.
subsetIMMevaluationOutputContRand1000_Actual1000_diff <- cbind.data.frame(
  s1$TsetName,
  s1$Method,
  s1$TsetSize,
  s1$SCRMs,
  s1$TrainingSetRecovered,
  s1$PercentageTrainingSetSensitivity,
  s1$REDflyRecovered,
  s1$PercentageRedflyRecovered,
  s1$percentageExpressionPatternRecall - s1$ExpectedpercentageExpressionPatternRecall,
  s2$SCRMs,
  s2$TrainingSetRecovered,
  s2$PercentageTrainingSetSensitivity,
  s2$REDflyRecovered,
  s2$PercentageRedflyRecovered,
  s1$PercentageTrainingSetSensitivity - s2$PercentageTrainingSetSensitivity,
  s1$PercentageRedflyRecovered - s2$PercentageRedflyRecovered
)

# Name only the columns that are used downstream.
colnames(subsetIMMevaluationOutputContRand1000_Actual1000_diff)[c(1, 4, 9, 15, 16)] <- c(
  "TsetName",
  "SCRMs",
  "DifferenceBetweenActualRandomPercentageSpecificity",
  "DifferenceBetweenActualRandomTsetRecovery",
  "DifferenceBetweenActualRandomRedflyRecovery"
)

## Ranges of the differences (min / median / max per training set), used when
## writing up the observations.

# Apply `stat` to the values of `column` for each training set in `diff_table`
# and return a list named by TsetName (in order of first appearance), with
# every value multiplied by 100.
summarise_diff_by_tset <- function(diff_table, column, stat) {
  tset_names <- as.character(unique(diff_table$TsetName))
  values <- diff_table[[column]]
  out <- lapply(tset_names, function(tset) {
    stat(values[diff_table$TsetName == tset]) * 100
  })
  names(out) <- tset_names
  out
}

# Difference to random in Redfly recovery
minimum_differenceBetweenActualRandomRedflyRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomRedflyRecovery",
  min
)
median_differenceBetweenActualRandomRedflyRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomRedflyRecovery",
  median
)
maximum_differenceBetweenActualRandomRedflyRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomRedflyRecovery",
  max
)

# Difference to random in Tset sensitivity
minimum_DifferenceBetweenActualTsetRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomTsetRecovery",
  min
)
maximum_DifferenceBetweenActualTsetRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomTsetRecovery",
  max
)
median_DifferenceBetweenActualTsetRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomTsetRecovery",
  median
)

# Specificity column (observed minus expected)
minimum_specificity <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomPercentageSpecificity",
  min
)
maximum_specificity <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomPercentageSpecificity",
  max
)
median_specificity <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomPercentageSpecificity",
  median
)
library(qdapTools)
# Turn each per-training-set summary list into a two-column data frame: the
# summary value (column named after the measure) and "TsetName".

# Difference to random in Redfly recovery
minred <- list2df(
  minimum_differenceBetweenActualRandomRedflyRecovery,
  col1 = "minimum difference to random in redfly recovery",
  col2 = "TsetName"
)
medred <- list2df(
  median_differenceBetweenActualRandomRedflyRecovery,
  col1 = "median difference to random in redfly recovery",
  col2 = "TsetName"
)
maxred <- list2df(
  maximum_differenceBetweenActualRandomRedflyRecovery,
  col1 = "maximum difference to random in redfly recovery",
  col2 = "TsetName"
)

# Difference to random in Tset sensitivity
minTset <- list2df(
  minimum_DifferenceBetweenActualTsetRecovery,
  col1 = "minimum difference to random in Tset sensitivity",
  col2 = "TsetName"
)
medTset <- list2df(
  median_DifferenceBetweenActualTsetRecovery,
  col1 = "median difference to random in Tset sensitivity",
  col2 = "TsetName"
)
maxTset <- list2df(
  maximum_DifferenceBetweenActualTsetRecovery,
  col1 = "maximum difference to random in Tset sensitivity",
  col2 = "TsetName"
)

# Difference to random in specificity
minSpec <- list2df(
  minimum_specificity,
  col1 = "minimum difference to random in specificity",
  col2 = "TsetName"
)
medSpec <- list2df(
  median_specificity,
  col1 = "median difference to random in specificity",
  col2 = "TsetName"
)
maxSpec <- list2df(
  maximum_specificity,
  col1 = "maximum difference to random in specificity",
  col2 = "TsetName"
)

# Join the three median tables on training-set name; only the medians go into
# the table that is written to disk.
df1 <- merge(x = medred, y = medTset, by = "TsetName")
dffinal <- merge(x = df1, y = medSpec, by = "TsetName")

write.table(
  x = dffinal,
  file = "~/Box/Old_and_newTsets_3postProcMethods_3categories/newTsets/finaltable_new74sets_medianAmplitudeCurve.txt",
  sep = "\t"
)

library(knitr)
library(kableExtra)
library(dplyr)

2.3.1.1 Good Tsets

Conditions being set for these good Tsets include:

  • Difference to random in Redfly recovery must be greater than or equal to 8 percent at cutoff,
  • Difference to random in Tset sensitivity must be greater than or equal to 8 percent,
  • Difference to expected in specificity must be greater than or equal to 4 percent.
# Classify the overall "good" training sets from the merged median table.
finaltable <- list()

# Work on a plain data frame with short, syntactic column names so the
# thresholds below can refer to the columns directly.
dffinalDF <- as.data.frame(dffinal)
names(dffinalDF) <- c(
  "TsetName",
  "MedianDifferenceToRandomOfRedflyRecovery",
  "MedianDifferenceInTsetSensitivity",
  "MedianDifferenceToPermutedInTsetSpecificity"
)

# A set is good overall only when all three medians reach their threshold.
goodTsetOverAll <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 8 &
    MedianDifferenceToPermutedInTsetSpecificity >= 4
)

goodTsetsOverAll <- list()

# Descriptive headers for the rendered table.
names(goodTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to permuted in Tset Specificity at cutoff"
)
goodTsetOverAll <- cbind(
  goodTsetOverAll[1],
  goodTsetOverAll[2],
  goodTsetOverAll[3],
  goodTsetOverAll[4]
)

kable_input3 <- kable(
  goodTsetOverAll,
  digits = 2,
  caption = "Overall Good Training Sets with TS and RR > 8 and SP > 4"
)
kable_input3
Overall Good Training Sets with TS and RR > 8 and SP > 4
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to permuted in Tset Specificity at cutoff
adult_mesoderm.mapping1 32.98 23.33 8.83
adult_muscle 24.87 35.00 10.89
adult_somatic_muscle 27.58 20.00 12.20
blastoderm.mapping1 25.30 32.92 11.09
embryonic_muscle 31.37 21.29 4.17
eye.mapping1 30.36 30.00 5.30
mesectoderm.mapping1 23.68 25.00 4.51
#write.table(goodTsetOverAll,file="~/Box/NewTrainingSetsJan2020/R/goodTsetOverAll.txt",sep = "\t")

2.3.1.2 Good Tsets Ignoring Specificity

Conditions being set for these good Tsets(if we ignore specificity) include:

  • Difference to random in Redfly recovery must be greater than or equal to 10 percent at cutoff,
  • Difference in Tset sensitivity to random must be greater than or equal to 10 percent.
# Good sets when specificity is ignored: only Redfly recovery and Tset
# sensitivity have to reach their thresholds.
goodTsetsIgnoringSpecificity <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 10 &
    MedianDifferenceInTsetSensitivity >= 10
)

# Descriptive headers for the rendered table.
names(goodTsetsIgnoringSpecificity) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to permuted in Tset Specificity at cutoff"
)
goodTsetsIgnoringSpecificity <- cbind(
  goodTsetsIgnoringSpecificity[1],
  goodTsetsIgnoringSpecificity[2],
  goodTsetsIgnoringSpecificity[3],
  goodTsetsIgnoringSpecificity[4]
)

kable_input4 <- kable(
  goodTsetsIgnoringSpecificity,
  digits = 2,
  caption = "Good Training Sets Ignoring Poor Specificity both > 10"
)
kable_input4
Good Training Sets Ignoring Poor Specificity both > 10
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to permuted in Tset Specificity at cutoff
adult_mesoderm.mapping1 32.98 23.33 8.83
adult_muscle 24.87 35.00 10.89
adult_somatic_muscle 27.58 20.00 12.20
blastoderm.mapping1 25.30 32.92 11.09
cardiac.mapping1 32.50 13.57 3.30
dorsal_ectoderm.mapping1 20.73 19.37 2.16
emb-larv_circulatory_system 27.42 16.00 2.20
emb-larv_hindgut 24.66 45.71 2.70
emb-larv_visceral 23.90 15.56 0.78
embryonic_midgut 31.79 12.94 2.34
embryonic_muscle 31.37 21.29 4.17
embryonic_somatic_muscle 26.82 32.00 3.21
embryonic_trachea 21.54 22.50 3.33
eye.mapping1 30.36 30.00 5.30
fat_body.mapping1 12.75 28.75 2.59
haltere_disc 29.99 15.56 0.64
mesectoderm.mapping1 23.68 25.00 4.51
mesoderm.mapping1 28.11 17.78 3.25
myoblast 20.77 26.67 1.35
ventral_ectoderm.mapping1 15.97 21.90 1.44
#write.table(goodTsetsIgnoringSpecificity,file="~/Box/NewTrainingSetsJan2020/R/goodTsetsIgnoringSpecificity.txt",sep = "\t")

2.3.1.3 Intermediate Tsets

These are the sets that fall into neither the very good nor the very bad training sets. Originally this meant sets that perform well in two measures but not in the third, with sets already categorized as good excluded; that exclusion is no longer applied, so the good sets also appear in the table below.

Specific conditions being set for these Intermediate sets include:

  • Difference to random in Redfly recovery must be greater than 8 and difference to random in Tset sensitivity must be greater than 8, but specificity should just be greater than 0, OR
  • Difference to random in Redfly recovery must be greater than 8 and specificity must be greater than 4, but difference to random in Tset sensitivity should just be greater than 0, OR
  • Difference to random in Tset sensitivity and specificity must be greater than 8 and 4 respectively, but difference to random in Redfly recovery should just be greater than 0.
# Intermediate sets: two measures reach their "good" threshold and the third
# is at least as good as random / expected (>= 0).
intermediateTSets <- filter(
  dffinalDF,
  (MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 8 &
    MedianDifferenceToPermutedInTsetSpecificity >= 0) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 0 &
      MedianDifferenceInTsetSensitivity >= 8 &
      MedianDifferenceToPermutedInTsetSpecificity >= 4) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 8 &
      MedianDifferenceInTsetSensitivity >= 0 &
      MedianDifferenceToPermutedInTsetSpecificity >= 4)
)
onlyintermediateTSets <- data.frame()

# Exclusion of the sets already classified as good is disabled here, so the
# table keeps every set that passes the filter above:
# onlyintermediateTSetsNum <-    which(!(intermediateTSets$TsetName %in% goodTsetOverAll$TsetName))
# for (i in 1:length(onlyintermediateTSetsNum)){
# onlyintermediateTSetIter <- intermediateTSets[onlyintermediateTSetsNum[i],]
# onlyintermediateTSets <- rbind(onlyintermediateTSets,onlyintermediateTSetIter)
#
# }
#intermediateTSets <- onlyintermediateTSets

# Descriptive headers for the rendered table.
names(intermediateTSets) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
intermediateTSets <- cbind(
  intermediateTSets[1],
  intermediateTSets[2],
  intermediateTSets[3],
  intermediateTSets[4]
)

kable_input41 <- kable(
  intermediateTSets,
  digits = 2,
  caption = "Intermediate Tsets with all 3 interm"
)
kable_input41
Intermediate Tsets with all 3 interm
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
adult_mesoderm.mapping1 32.98 23.33 8.83
adult_muscle 24.87 35.00 10.89
adult_pns 0.94 20.00 5.06
adult_somatic_muscle 27.58 20.00 12.20
amnioserosa.mapping1 31.40 7.37 5.41
antennal_lobe 20.40 8.39 0.99
blastoderm.mapping1 25.30 32.92 11.09
cardiac.mapping1 32.50 13.57 3.30
dorsal_ectoderm.mapping1 20.73 19.37 2.16
emb-larv_circulatory_system 27.42 16.00 2.20
emb-larv_excretory 31.83 10.00 1.96
emb-larv_foregut 22.92 4.44 4.91
emb-larv_hindgut 24.66 45.71 2.70
emb-larv_visceral 23.90 15.56 0.78
embryonic_epidermis 37.52 6.67 4.70
embryonic_midgut 31.79 12.94 2.34
embryonic_muscle 31.37 21.29 4.17
embryonic_pns 26.26 10.00 2.49
embryonic_sense_organ 1.62 26.67 8.16
embryonic_somatic_muscle 26.82 32.00 3.21
embryonic_trachea 21.54 22.50 3.33
eye.mapping1 30.36 30.00 5.30
fat_body.mapping1 12.75 28.75 2.59
haltere_disc 29.99 15.56 0.64
leg_disc 21.28 10.00 1.76
malpig.mapping1 33.72 10.00 2.85
mesectoderm.mapping1 23.68 25.00 4.51
mesoderm.mapping1 28.11 17.78 3.25
myoblast 20.77 26.67 1.35
salivary.mapping1 26.37 2.11 5.81
ventral_ectoderm.mapping1 15.97 21.90 1.44
#write.table(intermediateTSets,file="~/Box/NewTrainingSetsJan2020/R/intermediateTSets.txt",sep = "\t")

2.3.1.4 Poor Tsets

The following conditions define the poor sets: a set that is poor in two categories is a poor set.

  • Difference to random in Redfly recovery is less than or equal to 8 and difference to random in Tset sensitivity is also less than or equal to 8, OR
  • Difference to random in Tset sensitivity is less than or equal to 8 and difference to permuted in Tset specificity is also less than or equal to 4, OR
  • Difference to random in Redfly recovery is less than or equal to 8 and difference to permuted in Tset specificity is also less than or equal to 4.
# Poor training sets: at or below the "good" threshold in at least two of the
# three measures.
# NOTE(review): the table caption below says "any one below 0", which does not
# describe this filter — confirm which one is intended.
poorTsetOverAll <- filter(
  dffinalDF,
  (MedianDifferenceToRandomOfRedflyRecovery <= 8 &
    MedianDifferenceInTsetSensitivity <= 8) |
    (MedianDifferenceInTsetSensitivity <= 8 &
      MedianDifferenceToPermutedInTsetSpecificity <= 4) |
    (MedianDifferenceToPermutedInTsetSpecificity <= 4 &
      MedianDifferenceToRandomOfRedflyRecovery <= 8)
)

# Descriptive headers for the rendered table.
names(poorTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)
poorTsetOverAll <- cbind(
  poorTsetOverAll[1],
  poorTsetOverAll[2],
  poorTsetOverAll[3],
  poorTsetOverAll[4]
)

kable_input6 <- kable(
  poorTsetOverAll,
  digits = 2,
  caption = "Overall Poor Training Sets with any one below 0"
)
kable_input6
Overall Poor Training Sets with any one below 0
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
adult_brain 12.23 -30.91 1.06
adult_circulatory 24.60 5.00 2.73
adult_cns 12.68 -32.68 1.18
adult_midgut 22.27 7.37 -0.31
adult_nervous 16.47 -35.05 1.33
adult_sense_organ 9.52 3.75 2.76
antenna 18.67 -30.00 0.96
antennal_disc 7.86 -1.90 0.74
disc.mapping1 22.36 -6.67 2.24
disc.mapping2 22.23 3.24 1.98
ectoderm.mapping1 24.18 -12.41 1.47
ectoderm.mapping2 26.67 -21.11 1.08
emb-larv_fat_body 7.57 35.00 2.78
emb-larval_cns 0.23 -4.29 0.35
emb-larval_mushroombody 15.23 2.50 1.72
emb-larval_neuron 23.51 -31.78 0.30
emb-larval_opticlobe 29.35 4.83 1.84
embryonic_salivary 1.91 31.43 2.29
embryonic_ventral_nervous_system 27.27 -17.94 1.63
endoderm.mapping1 29.93 -5.38 2.02
eye_disc 18.57 2.86 2.13
eye-antennal_disc 22.21 -2.22 3.26
eye.mapping2 30.94 -19.69 2.02
female_reproductive 18.67 8.00 2.94
genital_disc 28.00 -4.29 1.40
glia 24.81 -3.64 1.40
glia.mapping1 21.17 2.50 2.00
glia.mapping2 22.68 0.48 2.19
gonad 30.24 -34.74 3.09
imaginal_disc 20.44 -16.06 1.94
leg 25.76 0.00 1.82
mesoderm.mapping2 35.07 -31.10 2.16
neuron 23.04 -34.81 0.52
pns.mapping1 22.78 5.28 2.01
reproductive.mapping2 29.73 -35.65 3.33
somatic_muscle.mapping1 29.34 7.83 3.57
trachea.mapping1 21.62 1.67 1.55
visceral.mapping1 27.43 6.34 2.78
wing.mapping2 22.02 -0.61 2.02
#write.table(poorTsetOverAll,file="~/Box/NewTrainingSetsJan2020/R/poorTsetOverAll.txt",sep = "\t")

2.3.2 Top Elbow of Amplitude curve

# Random (control) evaluation results that the actual results are compared to.
# NOTE(review): this section evaluates the Elbow results but reads the same
# control file as the Median section (its name contains "randomMEDIAN") —
# confirm that this is the intended control.
top_fake <- read.table(
  "/Users/hasibaasma/Box/NewTrainingSetsJan2020/R/random62oldRun/old/del_randomMEDIAN62sets_75times_Excel_withSpec.txt",
  header = TRUE
)

# Actual results for this section: the Elbow post-processing table (the
# variable keeps the name used in the Median section).
top_orig_newMed <- top_elbow_imm

# Sort both tables by training-set name; they are paired row by row below.
s1 <- top_orig_newMed[order(top_orig_newMed$TsetName), ]
s2 <- top_fake[order(top_fake$TsetName), ]

# The pairing is purely positional, so make a mismatch visible instead of
# silently subtracting rows that belong to different training sets.
if (!identical(as.character(s1$TsetName), as.character(s2$TsetName))) {
  warning(
    "Actual and random evaluation tables do not list the same TsetName ",
    "values row by row; actual-vs-random differences may pair unrelated rows.",
    call. = FALSE
  )
}

# Columns 1-8: actual results; 9: observed minus expected expression-pattern
# recall of the actual results; 10-14: random results; 15-16: actual minus
# random for Tset sensitivity and Redfly recovery.
subsetIMMevaluationOutputContRand1000_Actual1000_diff <- cbind.data.frame(
  s1$TsetName,
  s1$Method,
  s1$TsetSize,
  s1$SCRMs,
  s1$TrainingSetRecovered,
  s1$PercentageTrainingSetSensitivity,
  s1$REDflyRecovered,
  s1$PercentageRedflyRecovered,
  s1$percentageExpressionPatternRecall - s1$ExpectedpercentageExpressionPatternRecall,
  s2$SCRMs,
  s2$TrainingSetRecovered,
  s2$PercentageTrainingSetSensitivity,
  s2$REDflyRecovered,
  s2$PercentageRedflyRecovered,
  s1$PercentageTrainingSetSensitivity - s2$PercentageTrainingSetSensitivity,
  s1$PercentageRedflyRecovered - s2$PercentageRedflyRecovered
)

# Name only the columns that are used downstream.
colnames(subsetIMMevaluationOutputContRand1000_Actual1000_diff)[c(1, 4, 9, 15, 16)] <- c(
  "TsetName",
  "SCRMs",
  "DifferenceBetweenActualRandomPercentageSpecificity",
  "DifferenceBetweenActualRandomTsetRecovery",
  "DifferenceBetweenActualRandomRedflyRecovery"
)

## Ranges of the differences (min / median / max per training set), used when
## writing up the observations.

# Apply `stat` to the values of `column` for each training set in `diff_table`
# and return a list named by TsetName (in order of first appearance), with
# every value multiplied by 100.
summarise_diff_by_tset <- function(diff_table, column, stat) {
  tset_names <- as.character(unique(diff_table$TsetName))
  values <- diff_table[[column]]
  out <- lapply(tset_names, function(tset) {
    stat(values[diff_table$TsetName == tset]) * 100
  })
  names(out) <- tset_names
  out
}

# Difference to random in Redfly recovery
minimum_differenceBetweenActualRandomRedflyRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomRedflyRecovery",
  min
)
median_differenceBetweenActualRandomRedflyRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomRedflyRecovery",
  median
)
maximum_differenceBetweenActualRandomRedflyRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomRedflyRecovery",
  max
)

# Difference to random in Tset sensitivity
minimum_DifferenceBetweenActualTsetRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomTsetRecovery",
  min
)
maximum_DifferenceBetweenActualTsetRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomTsetRecovery",
  max
)
median_DifferenceBetweenActualTsetRecovery <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomTsetRecovery",
  median
)

# Specificity column (observed minus expected)
minimum_specificity <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomPercentageSpecificity",
  min
)
maximum_specificity <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomPercentageSpecificity",
  max
)
median_specificity <- summarise_diff_by_tset(
  subsetIMMevaluationOutputContRand1000_Actual1000_diff,
  "DifferenceBetweenActualRandomPercentageSpecificity",
  median
)
library(qdapTools)
# Turn each per-training-set summary list into a two-column data frame: the
# summary value (column named after the measure) and "TsetName".

# Difference to random in Redfly recovery
minred <- list2df(
  minimum_differenceBetweenActualRandomRedflyRecovery,
  col1 = "minimum difference to random in redfly recovery",
  col2 = "TsetName"
)
medred <- list2df(
  median_differenceBetweenActualRandomRedflyRecovery,
  col1 = "median difference to random in redfly recovery",
  col2 = "TsetName"
)
maxred <- list2df(
  maximum_differenceBetweenActualRandomRedflyRecovery,
  col1 = "maximum difference to random in redfly recovery",
  col2 = "TsetName"
)

# Difference to random in Tset sensitivity
minTset <- list2df(
  minimum_DifferenceBetweenActualTsetRecovery,
  col1 = "minimum difference to random in Tset sensitivity",
  col2 = "TsetName"
)
medTset <- list2df(
  median_DifferenceBetweenActualTsetRecovery,
  col1 = "median difference to random in Tset sensitivity",
  col2 = "TsetName"
)
maxTset <- list2df(
  maximum_DifferenceBetweenActualTsetRecovery,
  col1 = "maximum difference to random in Tset sensitivity",
  col2 = "TsetName"
)

# Difference to random in specificity
minSpec <- list2df(
  minimum_specificity,
  col1 = "minimum difference to random in specificity",
  col2 = "TsetName"
)
medSpec <- list2df(
  median_specificity,
  col1 = "median difference to random in specificity",
  col2 = "TsetName"
)
maxSpec <- list2df(
  maximum_specificity,
  col1 = "maximum difference to random in specificity",
  col2 = "TsetName"
)

# Join the three median tables on training-set name; only the medians go into
# the table that is written to disk.
df1 <- merge(x = medred, y = medTset, by = "TsetName")
dffinal <- merge(x = df1, y = medSpec, by = "TsetName")

write.table(
  x = dffinal,
  file = "~/Box/Old_and_newTsets_3postProcMethods_3categories/newTsets/finaltable_new74sets_elbowAmplitudeCurve.txt",
  sep = "\t"
)

library(knitr)
library(kableExtra)
library(dplyr)

2.3.2.1 Good Tsets

Conditions being set for these good Tsets include:

  • Difference To Random in Redfly Recovery must be greater than or equal to 8 percent at cutoff,
  • Difference to Random in Tset Sensitivity must be greater than or equal to 8 percent,
  • Difference to Expected in Specificity must be greater than or equal to 4 percent
finaltable <- list()

# Good training sets (overall): work on a plain data frame with short,
# syntactic column names so the thresholds can be written directly.
dffinalDF <- as.data.frame(dffinal)
names(dffinalDF) <- c(
  "TsetName",
  "MedianDifferenceToRandomOfRedflyRecovery",
  "MedianDifferenceInTsetSensitivity",
  "MedianDifferenceToPermutedInTsetSpecificity"
)

# A set is "good" only when all three medians reach their (inclusive)
# thresholds at the same time.
goodTsetOverAll <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 8 &
    MedianDifferenceToPermutedInTsetSpecificity >= 4
)

goodTsetsOverAll <- list()

# Human-readable headers for the rendered table.
names(goodTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to permuted in Tset Specificity at cutoff"
)

# Keep the four columns in their existing order.
goodTsetOverAll <- goodTsetOverAll[1:4]

# NOTE(review): the caption says "> 8" / "> 4" while the filter above is
# inclusive (>=).
kable_input3 <- kable(
  goodTsetOverAll,
  digits = 2,
  caption = "Overall Good Training Sets with TS and RR > 8 and SP > 4"
)
#column_spec(kable_input3,2:10,width = "2cm")
kable_input3
Overall Good Training Sets with TS and RR > 8 and SP > 4
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to permuted in Tset Specificity at cutoff
adult_muscle 38.49 28.75 11.11
adult_somatic_muscle 41.32 20.00 12.20
blastoderm.mapping1 47.60 22.50 6.62
#write.table(goodTsetOverAll,file="~/Box/NewTrainingSetsJan2020/R/goodTsetOverAll.txt",sep = "\t")

2.3.2.2 Good Tsets Ignoring Specificity

Conditions being set for these good Tsets(if we ignore specificity) include:

  • Difference To Random in redfly recovery must be greater than or equal to 10 percent at cutoff,
  • Difference In Tset Sensitivity to random must be greater than or equal to 10 percent.
# Good training sets when specificity is ignored: redfly recovery and Tset
# sensitivity must both reach 10 (inclusive); specificity is not tested.
goodTsetsIgnoringSpecificity <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 10 &
    MedianDifferenceInTsetSensitivity >= 10
)

# Human-readable headers for the rendered table.
names(goodTsetsIgnoringSpecificity) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to permuted in Tset Specificity at cutoff"
)

# Keep the four columns in their existing order.
goodTsetsIgnoringSpecificity <- goodTsetsIgnoringSpecificity[1:4]

kable_input4 <- kable(
  goodTsetsIgnoringSpecificity,
  digits = 2,
  caption = "Good Training Sets Ignoring Poor Specificity both > 10"
)
#column_spec(kable_input4,2:10,width = "2cm")
kable_input4
Good Training Sets Ignoring Poor Specificity both > 10
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to permuted in Tset Specificity at cutoff
adult_muscle 38.49 28.75 11.11
adult_somatic_muscle 41.32 20.00 12.20
blastoderm.mapping1 47.60 22.50 6.62
emb-larv_fat_body 56.40 28.75 2.78
emb-larv_hindgut 42.66 38.57 2.77
embryonic_salivary 11.61 31.43 -0.29
eye.mapping1 56.40 20.00 3.18
haltere_disc 45.48 15.56 0.24
myoblast 48.23 15.56 0.94
ventral_ectoderm.mapping1 45.71 21.90 1.57
#write.table(goodTsetsIgnoringSpecificity,file="~/Box/NewTrainingSetsJan2020/R/goodTsetsIgnoringSpecificity.txt",sep = "\t")

2.3.2.3 Intermediate Tsets

These are the sets basically, that do not fall into very good or very bad training sets, mainly because they perform good in two measures but not in the third one. Not anymore , Excluded those sets which were already categorized as good.

Specific conditions being set for these Intermediate sets include:

  • Difference To Random in redfly recovery must be greater than or equal to 8 and Difference to random in Tset Sensitivity must be greater than or equal to 8, but that of specificity only needs to be greater than or equal to 0, OR
  • Difference To Random in redfly recovery must be greater than or equal to 8 and specificity must be greater than or equal to 4, but Difference to random in Tset Sensitivity only needs to be greater than or equal to 0, OR
  • Difference To random in Tset Sensitivity and specificity must be greater than or equal to 8 and 4 respectively, but Difference to random in redfly recovery only needs to be greater than or equal to 0.
# Intermediate training sets: two of the three medians reach their full
# threshold (8 / 8 / 4) while the remaining one only has to be non-negative.
intermediateTSets <- filter(
  dffinalDF,
  (MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 8 &
    MedianDifferenceToPermutedInTsetSpecificity >= 0) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 0 &
      MedianDifferenceInTsetSensitivity >= 8 &
      MedianDifferenceToPermutedInTsetSpecificity >= 4) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 8 &
      MedianDifferenceInTsetSensitivity >= 0 &
      MedianDifferenceToPermutedInTsetSpecificity >= 4)
)
onlyintermediateTSets <- data.frame()

# NOTE(review): the step that dropped sets already listed in goodTsetOverAll
# is disabled, so sets passing all three full thresholds are still included
# here. The disabled logic, kept for reference:
# onlyintermediateTSetsNum <- which(!(intermediateTSets$TsetName %in% goodTsetOverAll$TsetName))
# for (i in 1:length(onlyintermediateTSetsNum)){
#   onlyintermediateTSets <- rbind(onlyintermediateTSets, intermediateTSets[onlyintermediateTSetsNum[i],])
# }
# intermediateTSets <- onlyintermediateTSets

# Human-readable headers for the rendered table.
names(intermediateTSets) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)

# Keep the four columns in their existing order.
intermediateTSets <- intermediateTSets[1:4]

kable_input41 <- kable(
  intermediateTSets,
  digits = 2,
  caption = "Intermediate Tsets with all 3 interm"
)
#column_spec(kable_input41,2:10,width = "3cm")
kable_input41
Intermediate Tsets with all 3 interm
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
adult_mesoderm.mapping1 47.22 6.67 7.66
adult_muscle 38.49 28.75 11.11
adult_somatic_muscle 41.32 20.00 12.20
blastoderm.mapping1 47.60 22.50 6.62
cardiac.mapping1 43.46 10.00 2.73
emb-larv_fat_body 56.40 28.75 2.78
emb-larv_hindgut 42.66 38.57 2.77
embryonic_sense_organ 3.26 21.90 4.87
embryonic_trachea 27.14 10.00 1.30
eye.mapping1 56.40 20.00 3.18
fat_body.mapping1 46.37 10.00 1.55
haltere_disc 45.48 15.56 0.24
myoblast 48.23 15.56 0.94
ventral_ectoderm.mapping1 45.71 21.90 1.57
#write.table(intermediateTSets,file="~/Box/NewTrainingSetsJan2020/R/intermediateTSets.txt",sep = "\t")

2.3.2.4 Poor Tsets

Conditions being set for these poor sets are listed below. If a set is poor in two categories then it is a poor set:

  • Difference To Random in redfly recovery is less than or equal to 8 and Difference To Random in Tset Sensitivity is also less than or equal to 8, OR
  • Difference To Random in Tset Sensitivity is less than or equal to 8 and Difference To Permuted in Tset Specificity is also less than or equal to 4, OR
  • Difference To Random in redfly recovery is less than or equal to 8 and Difference To Permuted in Tset Specificity is also less than or equal to 4.
# Poor training sets: at or below threshold (8 / 8 / 4, inclusive) in at
# least two of the three measures.
poorTsetOverAll <- filter(
  dffinalDF,
  (MedianDifferenceToRandomOfRedflyRecovery <= 8 &
    MedianDifferenceInTsetSensitivity <= 8) |
    (MedianDifferenceInTsetSensitivity <= 8 &
      MedianDifferenceToPermutedInTsetSpecificity <= 4) |
    (MedianDifferenceToPermutedInTsetSpecificity <= 4 &
      MedianDifferenceToRandomOfRedflyRecovery <= 8)
)

# Human-readable headers for the rendered table.
names(poorTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)

# Keep the four columns in their existing order.
poorTsetOverAll <- poorTsetOverAll[1:4]

# NOTE(review): the caption mentions "any one below 0", but the filter above
# selects sets that are at or below threshold in two measures.
kable_input6 <- kable(
  poorTsetOverAll,
  digits = 2,
  caption = "Overall Poor Training Sets with any one below 0"
)
#column_spec(kable_input6,2:10,width = "2cm")
kable_input6
Overall Poor Training Sets with any one below 0
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
adult_brain 14.42 -36.10 0.81
adult_circulatory 40.87 -5.00 0.00
adult_cns 16.33 -36.34 0.89
adult_foregut 47.22 0.00 -0.48
adult_midgut 45.39 -3.16 -0.15
adult_nervous 18.42 -37.03 0.91
adult_pns 38.41 4.00 3.49
adult_sense_organ 25.33 -12.92 1.15
amnioserosa.mapping1 32.63 -0.53 3.02
antenna 26.91 -37.50 0.42
antennal_disc 9.44 -6.67 0.43
antennal_lobe 31.54 1.94 0.43
disc.mapping1 25.27 -17.27 0.75
disc.mapping2 35.75 -4.86 0.37
dorsal_ectoderm.mapping1 36.72 6.87 1.58
ectoderm.mapping1 29.23 -21.03 1.13
ectoderm.mapping2 29.95 -31.11 0.62
emb-larv_circulatory_system 47.49 4.00 0.94
emb-larv_excretory 46.89 -4.29 0.45
emb-larv_foregut 30.50 -1.11 1.27
emb-larv_visceral 32.35 4.44 0.23
emb-larval_cns 2.78 -13.21 -0.06
emb-larval_mushroombody 20.55 -5.00 1.37
emb-larval_neuron 32.79 -35.89 0.05
emb-larval_opticlobe 36.67 -2.07 1.27
embryonic_epidermis 45.67 6.67 2.80
embryonic_midgut 36.11 1.18 1.34
embryonic_muscle 49.19 5.16 2.92
embryonic_pns 32.18 -11.43 2.28
embryonic_somatic_muscle 34.78 8.00 2.61
embryonic_ventral_nervous_system 33.67 -25.29 0.60
endoderm.mapping1 35.43 -22.69 0.53
eye_disc 29.63 -5.71 1.72
eye-antennal_disc 33.94 -8.89 1.88
eye.mapping2 41.34 -24.38 0.87
female_gonad.mapping1 42.36 -12.41 1.87
female_reproductive 25.26 -4.00 1.56
genital_disc 30.98 -7.86 2.06
glia 26.86 -15.76 0.86
glia.mapping1 31.62 -2.50 1.94
glia.mapping2 27.32 -4.29 1.78
gonad 38.87 -34.74 2.57
imaginal_disc 26.23 -23.10 0.95
leg 27.90 -7.50 0.44
leg_disc 21.94 -1.54 0.52
male_reproductive 32.16 -26.96 3.92
malpig.mapping1 51.65 -2.50 2.15
mesectoderm.mapping1 47.08 5.00 3.38
mesoderm.mapping1 39.21 6.67 1.43
mesoderm.mapping2 39.32 -36.58 1.05
neuron 31.16 -36.10 0.02
pns.mapping1 27.41 -13.58 0.66
reproductive.mapping2 32.42 -35.65 2.15
salivary.mapping1 33.61 -18.95 1.27
somatic_muscle.mapping1 35.94 -7.39 2.24
trachea.mapping1 35.75 -2.50 1.20
visceral.mapping1 37.11 -0.98 1.95
wing.mapping2 21.48 -6.67 0.56
#write.table(poorTsetOverAll,file="~/Box/NewTrainingSetsJan2020/R/poorTsetOverAll.txt",sep = "\t")

2.3.3 Top None of Amplitude curve

# Results for the randomised training sets.
# NOTE(review): the file name refers to 62 sets / 75 repetitions (the old
# comment here said "random 29") -- confirm which run this is.
# NOTE(review): absolute, user-specific path; other reads in this file use
# "~/Box/..." -- confirm before running on another machine.
top_fake <- read.table(
  "/Users/hasibaasma/Box/NewTrainingSetsJan2020/R/random62oldRun/old/del_randomMEDIAN62sets_75times_Excel_withSpec.txt",
  header = TRUE
)

# Actual results used for this section.
top_orig_newMed <- top_none_imm

# Sort both tables by training-set name so rows can be paired by position.
s1 <- top_orig_newMed[order(top_orig_newMed$TsetName), ]
s2 <- top_fake[order(top_fake$TsetName), ]

# The differences below pair row k of s1 with row k of s2. Flag the case
# where the training-set names do not line up instead of silently
# subtracting values that belong to different sets.
if (!identical(as.character(s1$TsetName), as.character(s2$TsetName))) {
  warning(
    "TsetName rows of the actual and random results do not line up; ",
    "the computed differences may pair different training sets",
    call. = FALSE
  )
}

# Columns 1-9 come from the actual run, 10-14 from the random run, and 15-16
# are actual-minus-random differences.
subsetIMMevaluationOutputContRand1000_Actual1000_diff <- cbind.data.frame(
  s1$TsetName,
  s1$Method,
  s1$TsetSize,
  s1$SCRMs,
  s1$TrainingSetRecovered,
  s1$PercentageTrainingSetSensitivity,
  s1$REDflyRecovered,
  s1$PercentageRedflyRecovered,
  s1$percentageExpressionPatternRecall - s1$ExpectedpercentageExpressionPatternRecall,
  s2$SCRMs,
  s2$TrainingSetRecovered,
  s2$PercentageTrainingSetSensitivity,
  s2$REDflyRecovered,
  s2$PercentageRedflyRecovered,
  s1$PercentageTrainingSetSensitivity - s2$PercentageTrainingSetSensitivity,
  s1$PercentageRedflyRecovered - s2$PercentageRedflyRecovered
)

# Name the columns used downstream. Column 9 is labelled as a specificity
# difference but is computed as observed minus expected expression-pattern
# recall from the actual run only.
colnames(subsetIMMevaluationOutputContRand1000_Actual1000_diff)[c(1, 4, 9, 15, 16)] <- c(
  "TsetName",
  "SCRMs",
  "DifferenceBetweenActualRandomPercentageSpecificity",
  "DifferenceBetweenActualRandomTsetRecovery",
  "DifferenceBetweenActualRandomRedflyRecovery"
)

## Per-training-set min / median / max of each difference column, scaled by
## 100, collected in lists keyed by training-set name.
minimum_differenceBetweenActualRandomRedflyRecovery <- list()
median_differenceBetweenActualRandomRedflyRecovery <- list()
maximum_differenceBetweenActualRandomRedflyRecovery <- list()
minimum_DifferenceBetweenActualTsetRecovery <- list()
median_DifferenceBetweenActualTsetRecovery <- list()
maximum_DifferenceBetweenActualTsetRecovery <- list()
minimum_specificity <- list()
median_specificity <- list()
maximum_specificity <- list()

for (i in unique(subsetIMMevaluationOutputContRand1000_Actual1000_diff$TsetName)) {
  # Rows of training set i, computed once and reused for all three measures.
  rows_i <- subsetIMMevaluationOutputContRand1000_Actual1000_diff$TsetName == i
  redfly_i <- subsetIMMevaluationOutputContRand1000_Actual1000_diff$DifferenceBetweenActualRandomRedflyRecovery[rows_i]
  tset_i <- subsetIMMevaluationOutputContRand1000_Actual1000_diff$DifferenceBetweenActualRandomTsetRecovery[rows_i]
  spec_i <- subsetIMMevaluationOutputContRand1000_Actual1000_diff$DifferenceBetweenActualRandomPercentageSpecificity[rows_i]

  # Redfly recovery
  minimum_differenceBetweenActualRandomRedflyRecovery[[i]] <- min(redfly_i) * 100
  median_differenceBetweenActualRandomRedflyRecovery[[i]] <- median(redfly_i) * 100
  maximum_differenceBetweenActualRandomRedflyRecovery[[i]] <- max(redfly_i) * 100

  # Training-set sensitivity
  minimum_DifferenceBetweenActualTsetRecovery[[i]] <- min(tset_i) * 100
  median_DifferenceBetweenActualTsetRecovery[[i]] <- median(tset_i) * 100
  maximum_DifferenceBetweenActualTsetRecovery[[i]] <- max(tset_i) * 100

  # Specificity
  minimum_specificity[[i]] <- min(spec_i) * 100
  median_specificity[[i]] <- median(spec_i) * 100
  maximum_specificity[[i]] <- max(spec_i) * 100
}
library(qdapTools)

# Flatten each per-training-set summary list into a two-column data frame:
# the summary value in the first column and the list names in "TsetName".
minred <- list2df(
  minimum_differenceBetweenActualRandomRedflyRecovery,
  col1 = "minimum difference to random in redfly recovery",
  col2 = "TsetName"
)
medred <- list2df(
  median_differenceBetweenActualRandomRedflyRecovery,
  col1 = "median difference to random in redfly recovery",
  col2 = "TsetName"
)
maxred <- list2df(
  maximum_differenceBetweenActualRandomRedflyRecovery,
  col1 = "maximum difference to random in redfly recovery",
  col2 = "TsetName"
)
minTset <- list2df(
  minimum_DifferenceBetweenActualTsetRecovery,
  col1 = "minimum difference to random in Tset sensitivity",
  col2 = "TsetName"
)
maxTset <- list2df(
  maximum_DifferenceBetweenActualTsetRecovery,
  col1 = "maximum difference to random in Tset sensitivity",
  col2 = "TsetName"
)
medTset <- list2df(
  median_DifferenceBetweenActualTsetRecovery,
  col1 = "median difference to random in Tset sensitivity",
  col2 = "TsetName"
)
minSpec <- list2df(
  minimum_specificity,
  col1 = "minimum difference to random in specificity",
  col2 = "TsetName"
)
maxSpec <- list2df(
  maximum_specificity,
  col1 = "maximum difference to random in specificity",
  col2 = "TsetName"
)
medSpec <- list2df(
  median_specificity,
  col1 = "median difference to random in specificity",
  col2 = "TsetName"
)

# Only the three median tables feed the final per-training-set table; they
# are joined on the training-set name.
df1 <- merge(medred, medTset, by = "TsetName")
dffinal <- merge(df1, medSpec, by = "TsetName")

# Save the merged medians as a tab-separated file.
write.table(
  dffinal,
  file = "~/Box/Old_and_newTsets_3postProcMethods_3categories/newTsets/finaltable_new74sets_noneAmplitudeCurve.txt",
  sep = "\t"
)

library(knitr)
library(kableExtra)
library(dplyr)

2.3.3.1 Good Tsets

Conditions being set for these good Tsets include:

  • Difference To Random in Redfly Recovery must be greater than or equal to 8 percent at cutoff,
  • Difference to Random in Tset Sensitivity must be greater than or equal to 8 percent,
  • Difference to Expected in Specificity must be greater than or equal to 4 percent
finaltable <- list()

# Good training sets (overall): work on a plain data frame with short,
# syntactic column names so the thresholds can be written directly.
dffinalDF <- as.data.frame(dffinal)
names(dffinalDF) <- c(
  "TsetName",
  "MedianDifferenceToRandomOfRedflyRecovery",
  "MedianDifferenceInTsetSensitivity",
  "MedianDifferenceToPermutedInTsetSpecificity"
)

# A set is "good" only when all three medians reach their (inclusive)
# thresholds at the same time.
goodTsetOverAll <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 8 &
    MedianDifferenceToPermutedInTsetSpecificity >= 4
)

goodTsetsOverAll <- list()

# Human-readable headers for the rendered table.
names(goodTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to permuted in Tset Specificity at cutoff"
)

# Keep the four columns in their existing order.
goodTsetOverAll <- goodTsetOverAll[1:4]

# NOTE(review): the caption says "> 8" / "> 4" while the filter above is
# inclusive (>=).
kable_input3 <- kable(
  goodTsetOverAll,
  digits = 2,
  caption = "Overall Good Training Sets with TS and RR > 8 and SP > 4"
)
#column_spec(kable_input3,2:10,width = "2cm")
kable_input3
Overall Good Training Sets with TS and RR > 8 and SP > 4
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to permuted in Tset Specificity at cutoff
adult_mesoderm.mapping1 27.08 26.67 10.65
adult_muscle 23.12 35.00 10.89
adult_somatic_muscle 24.25 20.00 11.71
amnioserosa.mapping1 27.46 10.00 5.78
blastoderm.mapping1 17.27 35.00 14.10
emb-larv_excretory 26.98 17.14 6.25
emb-larv_fat_body 9.51 41.25 5.00
emb-larv_hindgut 19.44 45.71 5.11
embryonic_muscle 26.20 27.74 5.00
embryonic_pns 20.71 17.14 5.28
embryonic_somatic_muscle 23.45 36.00 4.67
embryonic_trachea 17.90 22.50 4.07
eye_disc 16.49 8.57 4.40
eye.mapping1 23.31 30.00 4.11
malpig.mapping1 27.28 10.00 6.92
mesectoderm.mapping1 20.23 30.00 6.47
mesoderm.mapping1 22.59 24.44 4.96
somatic_muscle.mapping1 25.21 10.00 6.24
visceral.mapping1 24.61 13.66 5.58
#write.table(goodTsetOverAll,file="~/Box/NewTrainingSetsJan2020/R/goodTsetOverAll.txt",sep = "\t")

2.3.3.2 Good Tsets Ignoring Specificity

Conditions being set for these good Tsets(if we ignore specificity) include:

  • Difference To Random in redfly recovery must be greater than or equal to 10 percent at cutoff,
  • Difference In Tset Sensitivity to random must be greater than or equal to 10 percent.
# Good training sets when specificity is ignored: redfly recovery and Tset
# sensitivity must both reach 10 (inclusive); specificity is not tested.
goodTsetsIgnoringSpecificity <- filter(
  dffinalDF,
  MedianDifferenceToRandomOfRedflyRecovery >= 10 &
    MedianDifferenceInTsetSensitivity >= 10
)

# Human-readable headers for the rendered table.
names(goodTsetsIgnoringSpecificity) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to permuted in Tset Specificity at cutoff"
)

# Keep the four columns in their existing order.
goodTsetsIgnoringSpecificity <- goodTsetsIgnoringSpecificity[1:4]

kable_input4 <- kable(
  goodTsetsIgnoringSpecificity,
  digits = 2,
  caption = "Good Training Sets Ignoring Poor Specificity both > 10"
)
#column_spec(kable_input4,2:10,width = "2cm")
kable_input4
Good Training Sets Ignoring Poor Specificity both > 10
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to permuted in Tset Specificity at cutoff
adult_mesoderm.mapping1 27.08 26.67 10.65
adult_muscle 23.12 35.00 10.89
adult_somatic_muscle 24.25 20.00 11.71
antennal_lobe 16.42 11.61 1.43
blastoderm.mapping1 17.27 35.00 14.10
cardiac.mapping1 25.05 13.57 3.97
disc.mapping2 19.82 11.35 3.00
dorsal_ectoderm.mapping1 18.97 19.37 3.65
emb-larv_circulatory_system 23.35 16.00 3.37
emb-larv_excretory 26.98 17.14 6.25
emb-larv_hindgut 19.44 45.71 5.11
emb-larv_visceral 21.06 15.56 0.91
embryonic_midgut 27.57 18.82 2.62
embryonic_muscle 26.20 27.74 5.00
embryonic_pns 20.71 17.14 5.28
embryonic_somatic_muscle 23.45 36.00 4.67
embryonic_trachea 17.90 22.50 4.07
eye.mapping1 23.31 30.00 4.11
female_reproductive 11.33 12.00 3.33
haltere_disc 23.55 15.56 1.51
leg_disc 14.53 13.85 2.31
mesectoderm.mapping1 20.23 30.00 6.47
mesoderm.mapping1 22.59 24.44 4.96
myoblast 15.69 26.67 1.15
ventral_ectoderm.mapping1 11.45 21.90 2.06
visceral.mapping1 24.61 13.66 5.58
#write.table(goodTsetsIgnoringSpecificity,file="~/Box/NewTrainingSetsJan2020/R/goodTsetsIgnoringSpecificity.txt",sep = "\t")

2.3.3.3 Intermediate Tsets

These are the sets basically, that do not fall into very good or very bad training sets, mainly because they perform good in two measures but not in the third one. Not anymore , Excluded those sets which were already categorized as good.

Specific conditions being set for these Intermediate sets include:

  • Difference To Random in redfly recovery must be greater than or equal to 8 and Difference to random in Tset Sensitivity must be greater than or equal to 8, but that of specificity only needs to be greater than or equal to 0, OR
  • Difference To Random in redfly recovery must be greater than or equal to 8 and specificity must be greater than or equal to 4, but Difference to random in Tset Sensitivity only needs to be greater than or equal to 0, OR
  • Difference To random in Tset Sensitivity and specificity must be greater than or equal to 8 and 4 respectively, but Difference to random in redfly recovery only needs to be greater than or equal to 0.
# Intermediate training sets: two of the three medians reach their full
# threshold (8 / 8 / 4) while the remaining one only has to be non-negative.
intermediateTSets <- filter(
  dffinalDF,
  (MedianDifferenceToRandomOfRedflyRecovery >= 8 &
    MedianDifferenceInTsetSensitivity >= 8 &
    MedianDifferenceToPermutedInTsetSpecificity >= 0) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 0 &
      MedianDifferenceInTsetSensitivity >= 8 &
      MedianDifferenceToPermutedInTsetSpecificity >= 4) |
    (MedianDifferenceToRandomOfRedflyRecovery >= 8 &
      MedianDifferenceInTsetSensitivity >= 0 &
      MedianDifferenceToPermutedInTsetSpecificity >= 4)
)
onlyintermediateTSets <- data.frame()

# NOTE(review): the step that dropped sets already listed in goodTsetOverAll
# is disabled, so sets passing all three full thresholds are still included
# here. The disabled logic, kept for reference:
# onlyintermediateTSetsNum <- which(!(intermediateTSets$TsetName %in% goodTsetOverAll$TsetName))
# for (i in 1:length(onlyintermediateTSetsNum)){
#   onlyintermediateTSets <- rbind(onlyintermediateTSets, intermediateTSets[onlyintermediateTSetsNum[i],])
# }
# intermediateTSets <- onlyintermediateTSets

# Human-readable headers for the rendered table.
names(intermediateTSets) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)

# Keep the four columns in their existing order.
intermediateTSets <- intermediateTSets[1:4]

kable_input41 <- kable(
  intermediateTSets,
  digits = 2,
  caption = "Intermediate Tsets with all 3 interm"
)
#column_spec(kable_input41,2:10,width = "3cm")
kable_input41
Intermediate Tsets with all 3 interm
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
adult_mesoderm.mapping1 27.08 26.67 10.65
adult_muscle 23.12 35.00 10.89
adult_somatic_muscle 24.25 20.00 11.71
amnioserosa.mapping1 27.46 10.00 5.78
antennal_lobe 16.42 11.61 1.43
blastoderm.mapping1 17.27 35.00 14.10
cardiac.mapping1 25.05 13.57 3.97
disc.mapping2 19.82 11.35 3.00
dorsal_ectoderm.mapping1 18.97 19.37 3.65
emb-larv_circulatory_system 23.35 16.00 3.37
emb-larv_excretory 26.98 17.14 6.25
emb-larv_fat_body 9.51 41.25 5.00
emb-larv_foregut 21.61 7.22 6.17
emb-larv_hindgut 19.44 45.71 5.11
emb-larv_visceral 21.06 15.56 0.91
emb-larval_opticlobe 24.16 8.28 3.04
embryonic_epidermis 32.66 6.67 6.50
embryonic_midgut 27.57 18.82 2.62
embryonic_muscle 26.20 27.74 5.00
embryonic_pns 20.71 17.14 5.28
embryonic_sense_organ 0.44 26.67 9.61
embryonic_somatic_muscle 23.45 36.00 4.67
embryonic_trachea 17.90 22.50 4.07
eye_disc 16.49 8.57 4.40
eye-antennal_disc 18.65 2.22 4.70
eye.mapping1 23.31 30.00 4.11
fat_body.mapping1 7.47 35.00 4.31
female_gonad.mapping1 14.79 1.38 4.13
female_reproductive 11.33 12.00 3.33
haltere_disc 23.55 15.56 1.51
leg_disc 14.53 13.85 2.31
malpig.mapping1 27.28 10.00 6.92
mesectoderm.mapping1 20.23 30.00 6.47
mesoderm.mapping1 22.59 24.44 4.96
myoblast 15.69 26.67 1.15
pns.mapping1 20.50 9.06 3.17
salivary.mapping1 23.04 2.11 7.51
somatic_muscle.mapping1 25.21 10.00 6.24
ventral_ectoderm.mapping1 11.45 21.90 2.06
visceral.mapping1 24.61 13.66 5.58
wing.mapping2 18.33 8.48 3.40
#write.table(intermediateTSets,file="~/Box/NewTrainingSetsJan2020/R/intermediateTSets.txt",sep = "\t")

2.3.3.4 Poor Tsets

Conditions being set for these poor sets are listed below. If a set is poor in two categories then it is a poor set:

  • Difference To Random in redfly recovery is less than or equal to 8 and Difference To Random in Tset Sensitivity is also less than or equal to 8, OR
  • Difference To Random in Tset Sensitivity is less than or equal to 8 and Difference To Permuted in Tset Specificity is also less than or equal to 4, OR
  • Difference To Random in redfly recovery is less than or equal to 8 and Difference To Permuted in Tset Specificity is also less than or equal to 4.
# Poor training sets: at or below threshold (8 / 8 / 4, inclusive) in at
# least two of the three measures.
poorTsetOverAll <- filter(
  dffinalDF,
  (MedianDifferenceToRandomOfRedflyRecovery <= 8 &
    MedianDifferenceInTsetSensitivity <= 8) |
    (MedianDifferenceInTsetSensitivity <= 8 &
      MedianDifferenceToPermutedInTsetSpecificity <= 4) |
    (MedianDifferenceToPermutedInTsetSpecificity <= 4 &
      MedianDifferenceToRandomOfRedflyRecovery <= 8)
)

# Human-readable headers for the rendered table.
names(poorTsetOverAll) <- c(
  "TsetName",
  "Difference to random in redfly recovery at cutoff",
  "Difference in Tset Sensitivity to random at cutoff",
  "Difference to random in Tset Specificity at cutoff"
)

# Keep the four columns in their existing order.
poorTsetOverAll <- poorTsetOverAll[1:4]

# NOTE(review): the caption mentions "any one below 0", but the filter above
# selects sets that are at or below threshold in two measures.
kable_input6 <- kable(
  poorTsetOverAll,
  digits = 2,
  caption = "Overall Poor Training Sets with any one below 0"
)
#column_spec(kable_input6,2:10,width = "2cm")
kable_input6
Overall Poor Training Sets with any one below 0
TsetName Difference to random in redfly recovery at cutoff Difference in Tset Sensitivity to random at cutoff Difference to random in Tset Specificity at cutoff
adult_brain 9.48 -25.71 2.06
adult_circulatory 21.46 5.00 3.03
adult_cns 10.81 -27.80 1.69
adult_midgut 21.91 7.37 -1.54
adult_nervous 11.96 -32.08 2.10
adult_sense_organ 6.89 10.00 3.85
antenna 17.32 -23.75 1.58
antennal_disc 6.07 2.86 0.79
disc.mapping1 19.35 -5.15 3.39
ectoderm.mapping1 21.87 -7.24 1.60
ectoderm.mapping2 23.02 -15.56 2.19
emb-larval_cns -2.90 1.07 0.37
emb-larval_mushroombody 10.33 5.00 3.51
emb-larval_neuron 17.51 -29.04 0.37
embryonic_ventral_nervous_system 19.30 -12.06 2.17
endoderm.mapping1 24.43 -3.46 3.04
eye.mapping2 26.70 -10.31 3.85
genital_disc 22.82 6.43 1.42
glia 19.27 2.42 1.36
glia.mapping1 17.07 5.00 3.61
glia.mapping2 17.42 2.86 2.65
imaginal_disc 13.54 -16.06 3.27
leg 21.55 5.00 3.95
mesoderm.mapping2 28.51 -29.04 3.76
neuron 16.04 -32.21 0.43
trachea.mapping1 18.05 5.83 0.74
#write.table(poorTsetOverAll,file="~/Box/NewTrainingSetsJan2020/R/poorTsetOverAll.txt",sep = "\t")